1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
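//
// As an illustration only (not drawn from any particular input), a scalar loop
// such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten for a vectorization factor of 4 so that each wide
// iteration computes a[i..i+3] = b[i..i+3] + c[i..i+3] with vector
// instructions, and the index advances by 4 instead of 1.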
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to
29 // the VPlan infrastructure and to introduce outer loop vectorization support
30 // (see docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 #ifndef NDEBUG
161 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
162 #endif
163 
164 /// @{
165 /// Metadata attribute names
166 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
167 const char LLVMLoopVectorizeFollowupVectorized[] =
168     "llvm.loop.vectorize.followup_vectorized";
169 const char LLVMLoopVectorizeFollowupEpilogue[] =
170     "llvm.loop.vectorize.followup_epilogue";
171 /// @}
172 
173 STATISTIC(LoopsVectorized, "Number of loops vectorized");
174 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
175 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
176 
177 static cl::opt<bool> EnableEpilogueVectorization(
178     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
179     cl::desc("Enable vectorization of epilogue loops."));
180 
181 static cl::opt<unsigned> EpilogueVectorizationForceVF(
182     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
183     cl::desc("When epilogue vectorization is enabled, and a value greater than "
184              "1 is specified, forces the given VF for all applicable epilogue "
185              "loops."));
186 
187 static cl::opt<unsigned> EpilogueVectorizationMinVF(
188     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189     cl::desc("Only loops with vectorization factor equal to or larger than "
190              "the specified value are considered for epilogue vectorization."));
191 
192 /// Loops with a known constant trip count below this number are vectorized only
193 /// if no scalar iteration overheads are incurred.
194 static cl::opt<unsigned> TinyTripCountVectorThreshold(
195     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
196     cl::desc("Loops with a constant trip count that is smaller than this "
197              "value are vectorized only if no scalar iteration overheads "
198              "are incurred."));
199 
200 // Option prefer-predicate-over-epilogue indicates that an epilogue is
201 // undesired, that predication is preferred, and the enum below lists the
202 // options. I.e., the vectorizer will try to fold the tail loop (epilogue) into
203 // the vector body and predicate the instructions accordingly. If tail-folding
204 // fails, the fallback strategy depends on these values:
205 namespace PreferPredicateTy {
206   enum Option {
207     ScalarEpilogue = 0,
208     PredicateElseScalarEpilogue,
209     PredicateOrDontVectorize
210   };
211 } // namespace PreferPredicateTy
212 
213 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
214     "prefer-predicate-over-epilogue",
215     cl::init(PreferPredicateTy::ScalarEpilogue),
216     cl::Hidden,
217     cl::desc("Tail-folding and predication preferences over creating a scalar "
218              "epilogue loop."),
219     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
220                           "scalar-epilogue",
221                           "Don't tail-predicate loops, create scalar epilogue"),
222                clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
223                           "predicate-else-scalar-epilogue",
224                           "Prefer tail-folding, create scalar epilogue if tail "
225                           "folding fails."),
226                clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
227                           "predicate-dont-vectorize",
228                           "Prefer tail-folding, don't attempt vectorization if "
229                           "tail-folding fails.")));
230 
231 static cl::opt<bool> MaximizeBandwidth(
232     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
233     cl::desc("Maximize bandwidth when selecting vectorization factor which "
234              "will be determined by the smallest type in loop."));
235 
236 static cl::opt<bool> EnableInterleavedMemAccesses(
237     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
238     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
239 
240 /// An interleave-group may need masking if it resides in a block that needs
241 /// predication, or in order to mask away gaps.
242 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
243     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
245 
246 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
247     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
248     cl::desc("We don't interleave loops with an estimated constant trip count "
249              "below this number"));
250 
251 static cl::opt<unsigned> ForceTargetNumScalarRegs(
252     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
253     cl::desc("A flag that overrides the target's number of scalar registers."));
254 
255 static cl::opt<unsigned> ForceTargetNumVectorRegs(
256     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
257     cl::desc("A flag that overrides the target's number of vector registers."));
258 
259 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
260     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
261     cl::desc("A flag that overrides the target's max interleave factor for "
262              "scalar loops."));
263 
264 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
265     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
266     cl::desc("A flag that overrides the target's max interleave factor for "
267              "vectorized loops."));
268 
269 static cl::opt<unsigned> ForceTargetInstructionCost(
270     "force-target-instruction-cost", cl::init(0), cl::Hidden,
271     cl::desc("A flag that overrides the target's expected cost for "
272              "an instruction to a single constant value. Mostly "
273              "useful for getting consistent testing."));
274 
275 static cl::opt<unsigned> SmallLoopCost(
276     "small-loop-cost", cl::init(20), cl::Hidden,
277     cl::desc(
278         "The cost of a loop that is considered 'small' by the interleaver."));
279 
280 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
281     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
282     cl::desc("Enable the use of the block frequency analysis to access PGO "
283              "heuristics minimizing code growth in cold regions and being more "
284              "aggressive in hot regions."));
285 
286 // Runtime interleave loops for load/store throughput.
287 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
288     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
289     cl::desc(
290         "Enable runtime interleaving until load/store ports are saturated"));
291 
292 /// Interleave small loops with scalar reductions.
293 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
294     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
295     cl::desc("Enable interleaving for loops with small iteration counts that "
296              "contain scalar reductions to expose ILP."));
297 
298 /// The number of stores in a loop that are allowed to need predication.
299 static cl::opt<unsigned> NumberOfStoresToPredicate(
300     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
301     cl::desc("Max number of stores to be predicated behind an if."));
302 
303 static cl::opt<bool> EnableIndVarRegisterHeur(
304     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
305     cl::desc("Count the induction variable only once when interleaving"));
306 
307 static cl::opt<bool> EnableCondStoresVectorization(
308     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
309     cl::desc("Enable if-predication of stores during vectorization."));
310 
311 static cl::opt<unsigned> MaxNestedScalarReductionIC(
312     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
313     cl::desc("The maximum interleave count to use when interleaving a scalar "
314              "reduction in a nested loop."));
315 
316 static cl::opt<bool>
317     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
318                            cl::Hidden,
319                            cl::desc("Prefer in-loop vector reductions, "
320                                     "overriding the target's preference."));
321 
322 static cl::opt<bool> PreferPredicatedReductionSelect(
323     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
324     cl::desc(
325         "Prefer predicating a reduction operation over an after loop select."));
326 
327 cl::opt<bool> EnableVPlanNativePath(
328     "enable-vplan-native-path", cl::init(false), cl::Hidden,
329     cl::desc("Enable VPlan-native vectorization path with "
330              "support for outer loop vectorization."));
331 
332 // FIXME: Remove this switch once we have divergence analysis. Currently we
333 // assume divergent non-backedge branches when this switch is true.
334 cl::opt<bool> EnableVPlanPredication(
335     "enable-vplan-predication", cl::init(false), cl::Hidden,
336     cl::desc("Enable VPlan-native vectorization path predicator with "
337              "support for outer loop vectorization."));
338 
339 // This flag enables the stress testing of the VPlan H-CFG construction in the
340 // VPlan-native vectorization path. It must be used in conjunction with
341 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
342 // verification of the H-CFGs built.
343 static cl::opt<bool> VPlanBuildStressTest(
344     "vplan-build-stress-test", cl::init(false), cl::Hidden,
345     cl::desc(
346         "Build VPlan for every supported loop nest in the function and bail "
347         "out right after the build (stress test the VPlan H-CFG construction "
348         "in the VPlan-native vectorization path)."));
349 
350 cl::opt<bool> llvm::EnableLoopInterleaving(
351     "interleave-loops", cl::init(true), cl::Hidden,
352     cl::desc("Enable loop interleaving in Loop vectorization passes"));
353 cl::opt<bool> llvm::EnableLoopVectorization(
354     "vectorize-loops", cl::init(true), cl::Hidden,
355     cl::desc("Run the Loop vectorization passes"));
356 
357 /// A helper function that returns the type of loaded or stored value.
358 static Type *getMemInstValueType(Value *I) {
359   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
360          "Expected Load or Store instruction");
361   if (auto *LI = dyn_cast<LoadInst>(I))
362     return LI->getType();
363   return cast<StoreInst>(I)->getValueOperand()->getType();
364 }
365 
366 /// A helper function that returns true if the given type is irregular. The
367 /// type is irregular if its allocated size doesn't equal the store size of an
368 /// element of the corresponding vector type at the given vectorization factor.
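/// For example (assuming a typical x86-64 data layout), x86_fp80 has a store
/// size of 10 bytes but an alloc size of 16 bytes, so four such elements
/// occupy 64 bytes in an array while <4 x x86_fp80> stores into only 40 bytes;
/// the type is therefore considered irregular at VF = 4.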
369 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
370   // Determine if an array of VF elements of type Ty is "bitcast compatible"
371   // with a <VF x Ty> vector.
372   if (VF.isVector()) {
373     auto *VectorTy = VectorType::get(Ty, VF);
374     return TypeSize::get(VF.getKnownMinValue() *
375                              DL.getTypeAllocSize(Ty).getFixedValue(),
376                          VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
377   }
378 
379   // If the vectorization factor is one, we just check if an array of type Ty
380   // requires padding between elements.
381   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
382 }
383 
384 /// A helper function that returns the reciprocal of the block probability of
385 /// predicated blocks. If we return X, we are assuming the predicated block
386 /// will execute once for every X iterations of the loop header.
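/// For example, with the current return value of 2, a predicated block is
/// assumed to execute on every other iteration of the loop header, so the cost
/// model (roughly speaking) scales that block's cost down by a factor of 2.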
387 ///
388 /// TODO: We should use actual block probability here, if available. Currently,
389 ///       we always assume predicated blocks have a 50% chance of executing.
390 static unsigned getReciprocalPredBlockProb() { return 2; }
391 
392 /// A helper function that adds a 'fast' flag to floating-point operations.
393 static Value *addFastMathFlag(Value *V) {
394   if (isa<FPMathOperator>(V))
395     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
396   return V;
397 }
398 
399 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
400   if (isa<FPMathOperator>(V))
401     cast<Instruction>(V)->setFastMathFlags(FMF);
402   return V;
403 }
404 
405 /// A helper function that returns an integer or floating-point constant with
406 /// value C.
407 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
408   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
409                            : ConstantFP::get(Ty, C);
410 }
411 
412 /// Returns "best known" trip count for the specified loop \p L as defined by
413 /// the following procedure:
414 ///   1) Returns exact trip count if it is known.
415 ///   2) Returns expected trip count according to profile data if any.
416 ///   3) Returns upper bound estimate if it is known.
417 ///   4) Returns None if all of the above failed.
418 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
419   // Check if exact trip count is known.
420   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
421     return ExpectedTC;
422 
423   // Check if there is an expected trip count available from profile data.
424   if (LoopVectorizeWithBlockFrequency)
425     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
426       return EstimatedTC;
427 
428   // Check if upper bound estimate is known.
429   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
430     return ExpectedTC;
431 
432   return None;
433 }
434 
435 namespace llvm {
436 
437 /// InnerLoopVectorizer vectorizes loops which contain only one basic
438 /// block to a specified vectorization factor (VF).
439 /// This class performs the widening of scalars into vectors, or multiple
440 /// scalars. This class also implements the following features:
441 /// * It inserts an epilogue loop for handling loops that don't have iteration
442 ///   counts that are known to be a multiple of the vectorization factor.
443 /// * It handles the code generation for reduction variables.
444 /// * Scalarization (implementation using scalars) of un-vectorizable
445 ///   instructions.
446 /// InnerLoopVectorizer does not perform any vectorization-legality
447 /// checks, and relies on the caller to check for the different legality
448 /// aspects. The InnerLoopVectorizer relies on the
449 /// LoopVectorizationLegality class to provide information about the induction
450 /// and reduction variables that were found for a given vectorization factor.
451 class InnerLoopVectorizer {
452 public:
453   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
454                       LoopInfo *LI, DominatorTree *DT,
455                       const TargetLibraryInfo *TLI,
456                       const TargetTransformInfo *TTI, AssumptionCache *AC,
457                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
458                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
459                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
460                       ProfileSummaryInfo *PSI)
461       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
462         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
463         Builder(PSE.getSE()->getContext()),
464         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
465         BFI(BFI), PSI(PSI) {
466     // Query this against the original loop and save it here because the profile
467     // of the original loop header may change as the transformation happens.
468     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
469         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
470   }
471 
472   virtual ~InnerLoopVectorizer() = default;
473 
474   /// Create a new empty loop that will contain vectorized instructions later
475   /// on, while the old loop will be used as the scalar remainder. Control flow
476   /// is generated around the vectorized (and scalar epilogue) loops consisting
477   /// of various checks and bypasses. Return the pre-header block of the new
478   /// loop.
479   /// In the case of epilogue vectorization, this function is overridden to
480   /// handle the more complex control flow around the loops.
481   virtual BasicBlock *createVectorizedLoopSkeleton();
482 
483   /// Widen a single instruction within the innermost loop.
484   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
485                         VPTransformState &State);
486 
487   /// Widen a single call instruction within the innermost loop.
488   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
489                             VPTransformState &State);
490 
491   /// Widen a single select instruction within the innermost loop.
492   void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
493                               bool InvariantCond, VPTransformState &State);
494 
495   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
496   void fixVectorizedLoop();
497 
498   // Return true if any runtime check is added.
499   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
500 
501   /// A type for vectorized values in the new loop. Each value from the
502   /// original loop, when vectorized, is represented by UF vector values in the
503   /// new unrolled loop, where UF is the unroll factor.
504   using VectorParts = SmallVector<Value *, 2>;
505 
506   /// Vectorize a single GetElementPtrInst based on information gathered and
507   /// decisions taken during planning.
508   void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
509                 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
510                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
511 
512   /// Vectorize a single PHINode in a block. This method handles the induction
513   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
514   /// arbitrary length vectors.
515   void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);
516 
517   /// A helper function to scalarize a single Instruction in the innermost loop.
518   /// Generates a sequence of scalar instances for each lane between \p MinLane
519   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
520   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
521   /// Instr's operands.
522   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
523                             const VPIteration &Instance, bool IfPredicateInstr,
524                             VPTransformState &State);
525 
526   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
527   /// is provided, the integer induction variable will first be truncated to
528   /// the corresponding type.
529   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
530 
531   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
532   /// vector or scalar value on-demand if one is not yet available. When
533   /// vectorizing a loop, we visit the definition of an instruction before its
534   /// uses. When visiting the definition, we either vectorize or scalarize the
535   /// instruction, creating an entry for it in the corresponding map. (In some
536   /// cases, such as induction variables, we will create both vector and scalar
537   /// entries.) Then, as we encounter uses of the definition, we derive values
538   /// for each scalar or vector use unless such a value is already available.
539   /// For example, if we scalarize a definition and one of its uses is vector,
540   /// we build the required vector on-demand with an insertelement sequence
541   /// when visiting the use. Otherwise, if the use is scalar, we can use the
542   /// existing scalar definition.
543   ///
544   /// Return a value in the new loop corresponding to \p V from the original
545   /// loop at unroll index \p Part. If the value has already been vectorized,
546   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
547   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
548   /// a new vector value on-demand by inserting the scalar values into a vector
549   /// with an insertelement sequence. If the value has been neither vectorized
550   /// nor scalarized, it must be loop invariant, so we simply broadcast the
551   /// value into a vector.
552   Value *getOrCreateVectorValue(Value *V, unsigned Part);
553 
554   void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
555     VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
556   }
557 
558   /// Return a value in the new loop corresponding to \p V from the original
559   /// loop at unroll and vector indices \p Instance. If the value has been
560   /// vectorized but not scalarized, the necessary extractelement instruction
561   /// will be generated.
562   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
563 
564   /// Construct the vector value of a scalarized value \p V one lane at a time.
565   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
566 
567   /// Try to vectorize interleaved access group \p Group with the base address
568   /// given in \p Addr, optionally masking the vector operations if \p
569   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
570   /// values in the vectorized loop.
571   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
572                                 ArrayRef<VPValue *> VPDefs,
573                                 VPTransformState &State, VPValue *Addr,
574                                 ArrayRef<VPValue *> StoredValues,
575                                 VPValue *BlockInMask = nullptr);
576 
577   /// Vectorize Load and Store instructions with the base address given in \p
578   /// Addr, optionally masking the vector operations if \p BlockInMask is
579   /// non-null. Use \p State to translate given VPValues to IR values in the
580   /// vectorized loop.
581   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
582                                   VPValue *Def, VPValue *Addr,
583                                   VPValue *StoredValue, VPValue *BlockInMask);
584 
585   /// Set the debug location in the builder using the debug location in
586   /// the instruction.
587   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
588 
589   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
590   void fixNonInductionPHIs(void);
591 
592 protected:
593   friend class LoopVectorizationPlanner;
594 
595   /// A small list of PHINodes.
596   using PhiVector = SmallVector<PHINode *, 4>;
597 
598   /// A type for scalarized values in the new loop. Each value from the
599   /// original loop, when scalarized, is represented by UF x VF scalar values
600   /// in the new unrolled loop, where UF is the unroll factor and VF is the
601   /// vectorization factor.
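  /// For instance (illustrative numbers only), with UF = 2 and VF = 4 a
  /// scalarized value is held as 2 parts of 4 lanes each, i.e. 8 scalar values
  /// in total.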
602   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
603 
604   /// Set up the values of the IVs correctly when exiting the vector loop.
605   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
606                     Value *CountRoundDown, Value *EndValue,
607                     BasicBlock *MiddleBlock);
608 
609   /// Create a new induction variable inside L.
610   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
611                                    Value *Step, Instruction *DL);
612 
613   /// Handle all cross-iteration phis in the header.
614   void fixCrossIterationPHIs();
615 
616   /// Fix a first-order recurrence. This is the second phase of vectorizing
617   /// this phi node.
618   void fixFirstOrderRecurrence(PHINode *Phi);
619 
620   /// Fix a reduction cross-iteration phi. This is the second phase of
621   /// vectorizing this phi node.
622   void fixReduction(PHINode *Phi);
623 
624   /// Clear NSW/NUW flags from reduction instructions if necessary.
625   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
626 
627   /// The Loop exit block may have single value PHI nodes with some
628   /// incoming value. While vectorizing we only handled real values
629   /// that were defined inside the loop and we should have one value for
630   /// each predecessor of its parent basic block. See PR14725.
631   void fixLCSSAPHIs();
632 
633   /// Iteratively sink the scalarized operands of a predicated instruction into
634   /// the block that was created for it.
635   void sinkScalarOperands(Instruction *PredInst);
636 
637   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
638   /// represented as.
639   void truncateToMinimalBitwidths();
640 
641   /// Create a broadcast instruction. This method generates a broadcast
642   /// instruction (shuffle) for loop invariant values and for the induction
643   /// value. If this is the induction variable then we extend it to N, N+1, ...,
644   /// which is needed because each iteration in the loop corresponds to a SIMD
645   /// element.
646   virtual Value *getBroadcastInstrs(Value *V);
647 
648   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
649   /// to each vector element of Val. The sequence starts at StartIdx.
650   /// \p Opcode is relevant for FP induction variable.
651   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
652                                Instruction::BinaryOps Opcode =
653                                Instruction::BinaryOpsEnd);
654 
655   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
656   /// variable on which to base the steps, \p Step is the size of the step, and
657   /// \p EntryVal is the value from the original loop that maps to the steps.
658   /// Note that \p EntryVal doesn't have to be an induction variable - it
659   /// can also be a truncate instruction.
660   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
661                         const InductionDescriptor &ID);
662 
663   /// Create a vector induction phi node based on an existing scalar one. \p
664   /// EntryVal is the value from the original loop that maps to the vector phi
665   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
666   /// truncate instruction, instead of widening the original IV, we widen a
667   /// version of the IV truncated to \p EntryVal's type.
668   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
669                                        Value *Step, Instruction *EntryVal);
670 
671   /// Returns true if an instruction \p I should be scalarized instead of
672   /// vectorized for the chosen vectorization factor.
673   bool shouldScalarizeInstruction(Instruction *I) const;
674 
675   /// Returns true if we should generate a scalar version of \p IV.
676   bool needsScalarInduction(Instruction *IV) const;
677 
678   /// If there is a cast involved in the induction variable \p ID, which should
679   /// be ignored in the vectorized loop body, this function records the
680   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
681   /// cast. We had already proved that the casted Phi is equal to the uncasted
682   /// Phi in the vectorized loop (under a runtime guard), and therefore
683   /// there is no need to vectorize the cast - the same value can be used in the
684   /// vector loop for both the Phi and the cast.
685   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
686   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
687   ///
688   /// \p EntryVal is the value from the original loop that maps to the vector
689   /// phi node and is used to distinguish what is the IV currently being
690   /// processed - original one (if \p EntryVal is a phi corresponding to the
691   /// original IV) or the "newly-created" one based on the proof mentioned above
692   /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
693   /// latter case \p EntryVal is a TruncInst and we must not record anything for
694   /// that IV, but it's error-prone to expect callers of this routine to care
695   /// about that, hence this explicit parameter.
696   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
697                                              const Instruction *EntryVal,
698                                              Value *VectorLoopValue,
699                                              unsigned Part,
700                                              unsigned Lane = UINT_MAX);
701 
702   /// Generate a shuffle sequence that will reverse the vector Vec.
703   virtual Value *reverseVector(Value *Vec);
704 
705   /// Returns (and creates if needed) the original loop trip count.
706   Value *getOrCreateTripCount(Loop *NewLoop);
707 
708   /// Returns (and creates if needed) the trip count of the widened loop.
709   Value *getOrCreateVectorTripCount(Loop *NewLoop);
710 
711   /// Returns a bitcasted value to the requested vector type.
712   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
713   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
714                                 const DataLayout &DL);
715 
716   /// Emit a bypass check to see if the vector trip count is zero, including if
717   /// it overflows.
718   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
719 
720   /// Emit a bypass check to see if all of the SCEV assumptions we've
721   /// had to make are correct.
722   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
723 
724   /// Emit bypass checks to check any memory assumptions we may have made.
725   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
726 
727   /// Compute the transformed value of Index at offset StartValue using step
728   /// StepValue.
729   /// For integer induction, returns StartValue + Index * StepValue.
730   /// For pointer induction, returns StartValue[Index * StepValue].
731   /// FIXME: The newly created binary instructions should contain nsw/nuw
732   /// flags, which can be found from the original scalar operations.
733   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
734                               const DataLayout &DL,
735                               const InductionDescriptor &ID) const;
736 
737   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
738   /// vector loop preheader, middle block and scalar preheader. Also
739   /// allocate a loop object for the new vector loop and return it.
740   Loop *createVectorLoopSkeleton(StringRef Prefix);
741 
742   /// Create new phi nodes for the induction variables to resume iteration count
743   /// in the scalar epilogue, from where the vectorized loop left off (given by
744   /// \p VectorTripCount).
745   /// In cases where the loop skeleton is more complicated (e.g. epilogue
746   /// vectorization) and the resume values can come from an additional bypass
747   /// block, the \p AdditionalBypass pair provides information about the bypass
748   /// block and the end value on the edge from bypass to this loop.
749   void createInductionResumeValues(
750       Loop *L, Value *VectorTripCount,
751       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
752 
753   /// Complete the loop skeleton by adding debug MDs, creating appropriate
754   /// conditional branches in the middle block, preparing the builder and
755   /// running the verifier. Take in the vector loop \p L as argument, and return
756   /// the preheader of the completed vector loop.
757   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
758 
759   /// Add additional metadata to \p To that was not present on \p Orig.
760   ///
761   /// Currently this is used to add the noalias annotations based on the
762   /// inserted memchecks.  Use this for instructions that are *cloned* into the
763   /// vector loop.
764   void addNewMetadata(Instruction *To, const Instruction *Orig);
765 
766   /// Add metadata from one instruction to another.
767   ///
768   /// This includes both the original MDs from \p From and additional ones (\see
769   /// addNewMetadata).  Use this for *newly created* instructions in the vector
770   /// loop.
771   void addMetadata(Instruction *To, Instruction *From);
772 
773   /// Similar to the previous function but it adds the metadata to a
774   /// vector of instructions.
775   void addMetadata(ArrayRef<Value *> To, Instruction *From);
776 
777   /// Allow subclasses to override and print debug traces before/after vplan
778   /// execution, when trace information is requested.
779   virtual void printDebugTracesAtStart() {}
780   virtual void printDebugTracesAtEnd() {}
781 
782   /// The original loop.
783   Loop *OrigLoop;
784 
785   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
786   /// dynamic knowledge to simplify SCEV expressions and converts them to a
787   /// more usable form.
788   PredicatedScalarEvolution &PSE;
789 
790   /// Loop Info.
791   LoopInfo *LI;
792 
793   /// Dominator Tree.
794   DominatorTree *DT;
795 
796   /// Alias Analysis.
797   AAResults *AA;
798 
799   /// Target Library Info.
800   const TargetLibraryInfo *TLI;
801 
802   /// Target Transform Info.
803   const TargetTransformInfo *TTI;
804 
805   /// Assumption Cache.
806   AssumptionCache *AC;
807 
808   /// Interface to emit optimization remarks.
809   OptimizationRemarkEmitter *ORE;
810 
811   /// LoopVersioning.  It's only set up (non-null) if memchecks were
812   /// used.
813   ///
814   /// This is currently only used to add no-alias metadata based on the
815   /// memchecks.  The actual versioning is performed manually.
816   std::unique_ptr<LoopVersioning> LVer;
817 
818   /// The vectorization SIMD factor to use. Each vector will have this many
819   /// vector elements.
820   ElementCount VF;
821 
822   /// The vectorization unroll factor to use. Each scalar is vectorized to this
823   /// many different vector instructions.
824   unsigned UF;
825 
826   /// The builder that we use
827   IRBuilder<> Builder;
828 
829   // --- Vectorization state ---
830 
831   /// The vector-loop preheader.
832   BasicBlock *LoopVectorPreHeader;
833 
834   /// The scalar-loop preheader.
835   BasicBlock *LoopScalarPreHeader;
836 
837   /// Middle Block between the vector and the scalar.
838   BasicBlock *LoopMiddleBlock;
839 
840   /// The (unique) ExitBlock of the scalar loop.  Note that
841   /// there can be multiple exiting edges reaching this block.
842   BasicBlock *LoopExitBlock;
843 
844   /// The vector loop body.
845   BasicBlock *LoopVectorBody;
846 
847   /// The scalar loop body.
848   BasicBlock *LoopScalarBody;
849 
850   /// A list of all bypass blocks. The first block is the entry of the loop.
851   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
852 
853   /// The new Induction variable which was added to the new block.
854   PHINode *Induction = nullptr;
855 
856   /// The induction variable of the old basic block.
857   PHINode *OldInduction = nullptr;
858 
859   /// Maps values from the original loop to their corresponding values in the
860   /// vectorized loop. A key value can map to either vector values, scalar
861   /// values or both kinds of values, depending on whether the key was
862   /// vectorized and scalarized.
863   VectorizerValueMap VectorLoopValueMap;
864 
865   /// Store instructions that were predicated.
866   SmallVector<Instruction *, 4> PredicatedInstructions;
867 
868   /// Trip count of the original loop.
869   Value *TripCount = nullptr;
870 
871   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
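  /// For example, if the original trip count is 100 with VF = 8 and UF = 2,
  /// the vector loop covers 96 iterations (100 - 100 % 16) and the remaining 4
  /// are left to the scalar epilogue.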
872   Value *VectorTripCount = nullptr;
873 
874   /// The legality analysis.
875   LoopVectorizationLegality *Legal;
876 
877   /// The profitability analysis.
878   LoopVectorizationCostModel *Cost;
879 
880   // Record whether runtime checks are added.
881   bool AddedSafetyChecks = false;
882 
883   // Holds the end values for each induction variable. We save the end values
884   // so we can later fix up the external users of the induction variables.
885   DenseMap<PHINode *, Value *> IVEndValues;
886 
887   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
888   // fixed up at the end of vector code generation.
889   SmallVector<PHINode *, 8> OrigPHIsToFix;
890 
891   /// BFI and PSI are used to check for profile guided size optimizations.
892   BlockFrequencyInfo *BFI;
893   ProfileSummaryInfo *PSI;
894 
895   // Whether this loop should be optimized for size based on profile-guided
896   // size optimizations.
897   bool OptForSizeBasedOnProfile;
898 };
899 
900 class InnerLoopUnroller : public InnerLoopVectorizer {
901 public:
902   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
903                     LoopInfo *LI, DominatorTree *DT,
904                     const TargetLibraryInfo *TLI,
905                     const TargetTransformInfo *TTI, AssumptionCache *AC,
906                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
907                     LoopVectorizationLegality *LVL,
908                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
909                     ProfileSummaryInfo *PSI)
910       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
911                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
912                             BFI, PSI) {}
913 
914 private:
915   Value *getBroadcastInstrs(Value *V) override;
916   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
917                        Instruction::BinaryOps Opcode =
918                        Instruction::BinaryOpsEnd) override;
919   Value *reverseVector(Value *Vec) override;
920 };
921 
922 /// Encapsulate information regarding vectorization of a loop and its epilogue.
923 /// This information is meant to be updated and used across two stages of
924 /// epilogue vectorization.
925 struct EpilogueLoopVectorizationInfo {
926   ElementCount MainLoopVF = ElementCount::getFixed(0);
927   unsigned MainLoopUF = 0;
928   ElementCount EpilogueVF = ElementCount::getFixed(0);
929   unsigned EpilogueUF = 0;
930   BasicBlock *MainLoopIterationCountCheck = nullptr;
931   BasicBlock *EpilogueIterationCountCheck = nullptr;
932   BasicBlock *SCEVSafetyCheck = nullptr;
933   BasicBlock *MemSafetyCheck = nullptr;
934   Value *TripCount = nullptr;
935   Value *VectorTripCount = nullptr;
936 
937   EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
938                                 unsigned EUF)
939       : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
940         EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
941     assert(EUF == 1 &&
942            "A high UF for the epilogue loop is likely not beneficial.");
943   }
944 };
945 
946 /// An extension of the inner loop vectorizer that creates a skeleton for a
947 /// vectorized loop that has its epilogue (residual) also vectorized.
948 /// The idea is to run the vplan on a given loop twice, firstly to set up the
949 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
950 /// from the first step and vectorize the epilogue.  This is achieved by
951 /// deriving two concrete strategy classes from this base class and invoking
952 /// them in succession from the loop vectorizer planner.
953 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
954 public:
955   InnerLoopAndEpilogueVectorizer(
956       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
957       DominatorTree *DT, const TargetLibraryInfo *TLI,
958       const TargetTransformInfo *TTI, AssumptionCache *AC,
959       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
960       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
961       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
962       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
963                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
964         EPI(EPI) {}
965 
966   // Override this function to handle the more complex control flow around the
967   // three loops.
968   BasicBlock *createVectorizedLoopSkeleton() final override {
969     return createEpilogueVectorizedLoopSkeleton();
970   }
971 
972   /// The interface for creating a vectorized skeleton using one of two
973   /// different strategies, each corresponding to one execution of the vplan
974   /// as described above.
975   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
976 
977   /// Holds and updates state information required to vectorize the main loop
978   /// and its epilogue in two separate passes. This setup helps us avoid
979   /// regenerating and recomputing runtime safety checks. It also helps us to
980   /// shorten the iteration-count-check path length for the cases where the
981   /// iteration count of the loop is so small that the main vector loop is
982   /// completely skipped.
983   EpilogueLoopVectorizationInfo &EPI;
984 };
985 
986 /// A specialized derived class of inner loop vectorizer that performs
987 /// vectorization of *main* loops in the process of vectorizing loops and their
988 /// epilogues.
989 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
990 public:
991   EpilogueVectorizerMainLoop(
992       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
993       DominatorTree *DT, const TargetLibraryInfo *TLI,
994       const TargetTransformInfo *TTI, AssumptionCache *AC,
995       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
996       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
997       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
998       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
999                                        EPI, LVL, CM, BFI, PSI) {}
1000   /// Implements the interface for creating a vectorized skeleton using the
1001   /// *main loop* strategy (i.e., the first pass of vplan execution).
1002   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1003 
1004 protected:
1005   /// Emits an iteration count bypass check once for the main loop (when \p
1006   /// ForEpilogue is false) and once for the epilogue loop (when \p
1007   /// ForEpilogue is true).
1008   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
1009                                              bool ForEpilogue);
1010   void printDebugTracesAtStart() override;
1011   void printDebugTracesAtEnd() override;
1012 };
1013 
1014 // A specialized derived class of inner loop vectorizer that performs
1015 // vectorization of *epilogue* loops in the process of vectorizing loops and
1016 // their epilogues.
1017 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
1018 public:
1019   EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
1020                     LoopInfo *LI, DominatorTree *DT,
1021                     const TargetLibraryInfo *TLI,
1022                     const TargetTransformInfo *TTI, AssumptionCache *AC,
1023                     OptimizationRemarkEmitter *ORE,
1024                     EpilogueLoopVectorizationInfo &EPI,
1025                     LoopVectorizationLegality *LVL,
1026                     llvm::LoopVectorizationCostModel *CM,
1027                     BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
1028       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1029                                        EPI, LVL, CM, BFI, PSI) {}
1030   /// Implements the interface for creating a vectorized skeleton using the
1031   /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
1032   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1033 
1034 protected:
1035   /// Emits an iteration count bypass check after the main vector loop has
1036   /// finished to see if there are any iterations left to execute by either
1037   /// the vector epilogue or the scalar epilogue.
1038   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1039                                                       BasicBlock *Bypass,
1040                                                       BasicBlock *Insert);
1041   void printDebugTracesAtStart() override;
1042   void printDebugTracesAtEnd() override;
1043 };
1044 } // end namespace llvm
1045 
1046 /// Look for a meaningful debug location on the instruction or its
1047 /// operands.
1048 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1049   if (!I)
1050     return I;
1051 
1052   DebugLoc Empty;
1053   if (I->getDebugLoc() != Empty)
1054     return I;
1055 
1056   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
1057     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
1058       if (OpInst->getDebugLoc() != Empty)
1059         return OpInst;
1060   }
1061 
1062   return I;
1063 }
1064 
1065 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
1066   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1067     const DILocation *DIL = Inst->getDebugLoc();
1068     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1069         !isa<DbgInfoIntrinsic>(Inst)) {
1070       assert(!VF.isScalable() && "scalable vectors not yet supported.");
1071       auto NewDIL =
1072           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1073       if (NewDIL)
1074         B.SetCurrentDebugLocation(NewDIL.getValue());
1075       else
1076         LLVM_DEBUG(dbgs()
1077                    << "Failed to create new discriminator: "
1078                    << DIL->getFilename() << " Line: " << DIL->getLine());
1079     }
1080     else
1081       B.SetCurrentDebugLocation(DIL);
1082   } else
1083     B.SetCurrentDebugLocation(DebugLoc());
1084 }
1085 
1086 /// Write a record \p DebugMsg about vectorization failure to the debug
1087 /// output stream. If \p I is passed, it is an instruction that prevents
1088 /// vectorization.
1089 #ifndef NDEBUG
1090 static void debugVectorizationFailure(const StringRef DebugMsg,
1091     Instruction *I) {
1092   dbgs() << "LV: Not vectorizing: " << DebugMsg;
1093   if (I != nullptr)
1094     dbgs() << " " << *I;
1095   else
1096     dbgs() << '.';
1097   dbgs() << '\n';
1098 }
1099 #endif
1100 
1101 /// Create an analysis remark that explains why vectorization failed
1102 ///
1103 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1104 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1105 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1106 /// the location of the remark.  \return the remark object that can be
1107 /// streamed to.
1108 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1109     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1110   Value *CodeRegion = TheLoop->getHeader();
1111   DebugLoc DL = TheLoop->getStartLoc();
1112 
1113   if (I) {
1114     CodeRegion = I->getParent();
1115     // If there is no debug location attached to the instruction, revert to
1116     // using the loop's.
1117     if (I->getDebugLoc())
1118       DL = I->getDebugLoc();
1119   }
1120 
1121   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
1122   R << "loop not vectorized: ";
1123   return R;
1124 }
1125 
1126 /// Return a value for Step multiplied by VF.
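/// For example (illustrative values only), a Step of 4 with a fixed VF of 4
/// yields the constant 16, while with a scalable VF of <vscale x 4> it yields
/// the runtime quantity 16 * vscale.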
1127 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1128   assert(isa<ConstantInt>(Step) && "Expected an integer step");
1129   Constant *StepVal = ConstantInt::get(
1130       Step->getType(),
1131       cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1132   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1133 }
1134 
1135 namespace llvm {
1136 
1137 void reportVectorizationFailure(const StringRef DebugMsg,
1138     const StringRef OREMsg, const StringRef ORETag,
1139     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
1140   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
1141   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1142   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
1143                 ORETag, TheLoop, I) << OREMsg);
1144 }
1145 
1146 } // end namespace llvm
1147 
1148 #ifndef NDEBUG
1149 /// \return string containing a file name and a line # for the given loop.
1150 static std::string getDebugLocString(const Loop *L) {
1151   std::string Result;
1152   if (L) {
1153     raw_string_ostream OS(Result);
1154     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1155       LoopDbgLoc.print(OS);
1156     else
1157       // Just print the module name.
1158       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1159     OS.flush();
1160   }
1161   return Result;
1162 }
1163 #endif
1164 
1165 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1166                                          const Instruction *Orig) {
1167   // If the loop was versioned with memchecks, add the corresponding no-alias
1168   // metadata.
1169   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1170     LVer->annotateInstWithNoAlias(To, Orig);
1171 }
1172 
1173 void InnerLoopVectorizer::addMetadata(Instruction *To,
1174                                       Instruction *From) {
1175   propagateMetadata(To, From);
1176   addNewMetadata(To, From);
1177 }
1178 
1179 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1180                                       Instruction *From) {
1181   for (Value *V : To) {
1182     if (Instruction *I = dyn_cast<Instruction>(V))
1183       addMetadata(I, From);
1184   }
1185 }
1186 
1187 namespace llvm {
1188 
// Hints for the loop vectorization cost model on how the scalar epilogue loop
// should be lowered.
1191 enum ScalarEpilogueLowering {
1192 
1193   // The default: allowing scalar epilogues.
1194   CM_ScalarEpilogueAllowed,
1195 
1196   // Vectorization with OptForSize: don't allow epilogues.
1197   CM_ScalarEpilogueNotAllowedOptSize,
1198 
  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, with no runtime
  // guards and no scalar iteration overhead.
1203   CM_ScalarEpilogueNotAllowedLowTripLoop,
1204 
1205   // Loop hint predicate indicating an epilogue is undesired.
1206   CM_ScalarEpilogueNotNeededUsePredicate,
1207 
  // Directive indicating we must either tail-fold or not vectorize.
1209   CM_ScalarEpilogueNotAllowedUsePredicate
1210 };
1211 
1212 /// LoopVectorizationCostModel - estimates the expected speedups due to
1213 /// vectorization.
/// In many cases vectorization is not profitable, for a number of reasons. In
/// this class we mainly attempt to predict the expected speedups or slowdowns
/// due to the supported instruction set. We use the TargetTransformInfo to
/// query the different backends for the cost of different operations.
1219 class LoopVectorizationCostModel {
1220 public:
1221   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1222                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1223                              LoopVectorizationLegality *Legal,
1224                              const TargetTransformInfo &TTI,
1225                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1226                              AssumptionCache *AC,
1227                              OptimizationRemarkEmitter *ORE, const Function *F,
1228                              const LoopVectorizeHints *Hints,
1229                              InterleavedAccessInfo &IAI)
1230       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1231         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1232         Hints(Hints), InterleaveInfo(IAI) {}
1233 
1234   /// \return An upper bound for the vectorization factor, or None if
1235   /// vectorization and interleaving should be avoided up front.
1236   Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1237 
1238   /// \return True if runtime checks are required for vectorization, and false
1239   /// otherwise.
1240   bool runtimeChecksRequired();
1241 
1242   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is non-zero,
  /// that vectorization factor will be selected if vectorization is possible.
1246   VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1247   VectorizationFactor
1248   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1249                                     const LoopVectorizationPlanner &LVP);
1250 
1251   /// Setup cost-based decisions for user vectorization factor.
1252   void selectUserVectorizationFactor(ElementCount UserVF) {
1253     collectUniformsAndScalars(UserVF);
1254     collectInstsToScalarize(UserVF);
1255   }
1256 
  /// \return The sizes (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar, such as
  /// 64-bit loop indices.
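  /// For example (illustrative), a loop that loads i8 values and stores i32
  /// results would report the pair (8, 32).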
1260   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1261 
1262   /// \return The desired interleave count.
1263   /// If interleave count has been specified by metadata it will be returned.
1264   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1265   /// are the selected vectorization factor and the cost of the selected VF.
1266   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1267 
  /// A memory access instruction may be vectorized in more than one way; the
  /// form it takes after vectorization depends on cost. This function makes
  /// cost-based decisions for Load/Store instructions and collects them in a
  /// map. This decision map is used for building the lists of loop-uniform and
  /// loop-scalar instructions. The calculated cost is saved together with the
  /// widening decision in order to avoid redundant calculations.
1275   void setCostBasedWideningDecision(ElementCount VF);
1276 
1277   /// A struct that represents some properties of the register usage
1278   /// of a loop.
1279   struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1286   };
1287 
  /// \return Information about the register usage of the loop for the given
  /// vectorization factors.
1290   SmallVector<RegisterUsage, 8>
1291   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1292 
1293   /// Collect values we want to ignore in the cost model.
1294   void collectValuesToIgnore();
1295 
  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1298   void collectInLoopReductions();
1299 
1300   /// \returns The smallest bitwidth each instruction can be represented with.
1301   /// The vector equivalents of these instructions should be truncated to this
1302   /// type.
1303   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1304     return MinBWs;
1305   }
1306 
1307   /// \returns True if it is more profitable to scalarize instruction \p I for
1308   /// vectorization factor \p VF.
1309   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1310     assert(VF.isVector() &&
1311            "Profitable to scalarize relevant only for VF > 1.");
1312 
1313     // Cost model is not run in the VPlan-native path - return conservative
1314     // result until this changes.
1315     if (EnableVPlanNativePath)
1316       return false;
1317 
1318     auto Scalars = InstsToScalarize.find(VF);
1319     assert(Scalars != InstsToScalarize.end() &&
1320            "VF not yet analyzed for scalarization profitability");
1321     return Scalars->second.find(I) != Scalars->second.end();
1322   }
1323 
1324   /// Returns true if \p I is known to be uniform after vectorization.
1325   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1326     if (VF.isScalar())
1327       return true;
1328 
1329     // Cost model is not run in the VPlan-native path - return conservative
1330     // result until this changes.
1331     if (EnableVPlanNativePath)
1332       return false;
1333 
1334     auto UniformsPerVF = Uniforms.find(VF);
1335     assert(UniformsPerVF != Uniforms.end() &&
1336            "VF not yet analyzed for uniformity");
1337     return UniformsPerVF->second.count(I);
1338   }
1339 
1340   /// Returns true if \p I is known to be scalar after vectorization.
1341   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1342     if (VF.isScalar())
1343       return true;
1344 
1345     // Cost model is not run in the VPlan-native path - return conservative
1346     // result until this changes.
1347     if (EnableVPlanNativePath)
1348       return false;
1349 
1350     auto ScalarsPerVF = Scalars.find(VF);
1351     assert(ScalarsPerVF != Scalars.end() &&
1352            "Scalar values are not calculated for VF");
1353     return ScalarsPerVF->second.count(I);
1354   }
1355 
1356   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1357   /// for vectorization factor \p VF.
1358   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1359     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1360            !isProfitableToScalarize(I, VF) &&
1361            !isScalarAfterVectorization(I, VF);
1362   }
1363 
  /// Decision that was taken during cost calculation for a memory instruction.
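  /// For example (illustrative): a unit-stride access such as a[i] typically
  /// maps to CM_Widen, a reversed access such as a[n - i] to CM_Widen_Reverse,
  /// and an indexed access such as a[b[i]] to CM_GatherScatter or
  /// CM_Scalarize, depending on target support.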
1365   enum InstWidening {
1366     CM_Unknown,
1367     CM_Widen,         // For consecutive accesses with stride +1.
1368     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1369     CM_Interleave,
1370     CM_GatherScatter,
1371     CM_Scalarize
1372   };
1373 
1374   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1375   /// instruction \p I and vector width \p VF.
1376   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1377                            unsigned Cost) {
1378     assert(VF.isVector() && "Expected VF >=2");
1379     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1380   }
1381 
1382   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1383   /// interleaving group \p Grp and vector width \p VF.
1384   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1385                            ElementCount VF, InstWidening W, unsigned Cost) {
1386     assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
1388     /// But the cost will be assigned to one instruction only.
1389     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1390       if (auto *I = Grp->getMember(i)) {
1391         if (Grp->getInsertPos() == I)
1392           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1393         else
1394           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1395       }
1396     }
1397   }
1398 
1399   /// Return the cost model decision for the given instruction \p I and vector
1400   /// width \p VF. Return CM_Unknown if this instruction did not pass
1401   /// through the cost modeling.
1402   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1403     assert(VF.isVector() && "Expected VF to be a vector VF");
1404     // Cost model is not run in the VPlan-native path - return conservative
1405     // result until this changes.
1406     if (EnableVPlanNativePath)
1407       return CM_GatherScatter;
1408 
1409     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1410     auto Itr = WideningDecisions.find(InstOnVF);
1411     if (Itr == WideningDecisions.end())
1412       return CM_Unknown;
1413     return Itr->second.first;
1414   }
1415 
1416   /// Return the vectorization cost for the given instruction \p I and vector
1417   /// width \p VF.
1418   unsigned getWideningCost(Instruction *I, ElementCount VF) {
1419     assert(VF.isVector() && "Expected VF >=2");
1420     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1421     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1422            "The cost is not calculated");
1423     return WideningDecisions[InstOnVF].second;
1424   }
1425 
1426   /// Return True if instruction \p I is an optimizable truncate whose operand
1427   /// is an induction variable. Such a truncate will be removed by adding a new
1428   /// induction variable with the destination type.
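  /// For example (illustrative), a 'trunc i64 %iv to i32' whose result feeds
  /// 32-bit arithmetic can be replaced by a new i32 induction variable,
  /// removing the truncate from the vectorized loop.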
1429   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1430     // If the instruction is not a truncate, return false.
1431     auto *Trunc = dyn_cast<TruncInst>(I);
1432     if (!Trunc)
1433       return false;
1434 
1435     // Get the source and destination types of the truncate.
1436     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1437     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1438 
1439     // If the truncate is free for the given types, return false. Replacing a
1440     // free truncate with an induction variable would add an induction variable
1441     // update instruction to each iteration of the loop. We exclude from this
1442     // check the primary induction variable since it will need an update
1443     // instruction regardless.
1444     Value *Op = Trunc->getOperand(0);
1445     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1446       return false;
1447 
1448     // If the truncated value is not an induction variable, return false.
1449     return Legal->isInductionPhi(Op);
1450   }
1451 
1452   /// Collects the instructions to scalarize for each predicated instruction in
1453   /// the loop.
1454   void collectInstsToScalarize(ElementCount VF);
1455 
1456   /// Collect Uniform and Scalar values for the given \p VF.
1457   /// The sets depend on CM decision for Load/Store instructions
1458   /// that may be vectorized as interleave, gather-scatter or scalarized.
1459   void collectUniformsAndScalars(ElementCount VF) {
1460     // Do the analysis once.
1461     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1462       return;
1463     setCostBasedWideningDecision(VF);
1464     collectLoopUniforms(VF);
1465     collectLoopScalars(VF);
1466   }
1467 
1468   /// Returns true if the target machine supports masked store operation
1469   /// for the given \p DataType and kind of access to \p Ptr.
1470   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1471     return Legal->isConsecutivePtr(Ptr) &&
1472            TTI.isLegalMaskedStore(DataType, Alignment);
1473   }
1474 
1475   /// Returns true if the target machine supports masked load operation
1476   /// for the given \p DataType and kind of access to \p Ptr.
1477   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1478     return Legal->isConsecutivePtr(Ptr) &&
1479            TTI.isLegalMaskedLoad(DataType, Alignment);
1480   }
1481 
1482   /// Returns true if the target machine supports masked scatter operation
1483   /// for the given \p DataType.
1484   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1485     return TTI.isLegalMaskedScatter(DataType, Alignment);
1486   }
1487 
1488   /// Returns true if the target machine supports masked gather operation
1489   /// for the given \p DataType.
1490   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1491     return TTI.isLegalMaskedGather(DataType, Alignment);
1492   }
1493 
1494   /// Returns true if the target machine can represent \p V as a masked gather
1495   /// or scatter operation.
1496   bool isLegalGatherOrScatter(Value *V) {
1497     bool LI = isa<LoadInst>(V);
1498     bool SI = isa<StoreInst>(V);
1499     if (!LI && !SI)
1500       return false;
1501     auto *Ty = getMemInstValueType(V);
1502     Align Align = getLoadStoreAlignment(V);
1503     return (LI && isLegalMaskedGather(Ty, Align)) ||
1504            (SI && isLegalMaskedScatter(Ty, Align));
1505   }
1506 
1507   /// Returns true if \p I is an instruction that will be scalarized with
1508   /// predication. Such instructions include conditional stores and
1509   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1512   bool isScalarWithPredication(Instruction *I,
1513                                ElementCount VF = ElementCount::getFixed(1));
1514 
1515   // Returns true if \p I is an instruction that will be predicated either
1516   // through scalar predication or masked load/store or masked gather/scatter.
1517   // Superset of instructions that return true for isScalarWithPredication.
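  // For example (illustrative), a store guarded by 'if (cond[i])' in the
  // source loop, or a load/store that requires a mask when the tail is
  // folded, is a predicated instruction.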
1518   bool isPredicatedInst(Instruction *I) {
1519     if (!blockNeedsPredication(I->getParent()))
1520       return false;
1521     // Loads and stores that need some form of masked operation are predicated
1522     // instructions.
1523     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1524       return Legal->isMaskRequired(I);
1525     return isScalarWithPredication(I);
1526   }
1527 
1528   /// Returns true if \p I is a memory instruction with consecutive memory
1529   /// access that can be widened.
1530   bool
1531   memoryInstructionCanBeWidened(Instruction *I,
1532                                 ElementCount VF = ElementCount::getFixed(1));
1533 
1534   /// Returns true if \p I is a memory instruction in an interleaved-group
1535   /// of memory accesses that can be vectorized with wide vector loads/stores
1536   /// and shuffles.
1537   bool
1538   interleavedAccessCanBeWidened(Instruction *I,
1539                                 ElementCount VF = ElementCount::getFixed(1));
1540 
1541   /// Check if \p Instr belongs to any interleaved access group.
1542   bool isAccessInterleaved(Instruction *Instr) {
1543     return InterleaveInfo.isInterleaved(Instr);
1544   }
1545 
1546   /// Get the interleaved access group that \p Instr belongs to.
1547   const InterleaveGroup<Instruction> *
1548   getInterleavedAccessGroup(Instruction *Instr) {
1549     return InterleaveInfo.getInterleaveGroup(Instr);
1550   }
1551 
1552   /// Returns true if we're required to use a scalar epilogue for at least
1553   /// the final iteration of the original loop.
1554   bool requiresScalarEpilogue() const {
1555     if (!isScalarEpilogueAllowed())
1556       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
1559     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1560       return true;
1561     return InterleaveInfo.requiresScalarEpilogue();
1562   }
1563 
  /// Returns true if a scalar epilogue is allowed, i.e. it is not prohibited
  /// by optsize or a loop hint annotation.
1566   bool isScalarEpilogueAllowed() const {
1567     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1568   }
1569 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1571   bool foldTailByMasking() const { return FoldTailByMasking; }
1572 
1573   bool blockNeedsPredication(BasicBlock *BB) {
1574     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1575   }
1576 
1577   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1578   /// nodes to the chain of instructions representing the reductions. Uses a
1579   /// MapVector to ensure deterministic iteration order.
1580   using ReductionChainMap =
1581       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1582 
1583   /// Return the chain of instructions representing an inloop reduction.
1584   const ReductionChainMap &getInLoopReductionChains() const {
1585     return InLoopReductionChains;
1586   }
1587 
1588   /// Returns true if the Phi is part of an inloop reduction.
1589   bool isInLoopReduction(PHINode *Phi) const {
1590     return InLoopReductionChains.count(Phi);
1591   }
1592 
1593   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1594   /// with factor VF.  Return the cost of the instruction, including
1595   /// scalarization overhead if it's needed.
1596   unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1597 
  /// Estimate the cost of a call instruction CI if it were vectorized with
  /// factor VF. Return the cost of the instruction, including scalarization
  /// overhead if it's needed. The flag NeedToScalarize shows if the call needs
  /// to be scalarized, i.e. either a vector version isn't available or it is
  /// too expensive.
1603   unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1604                              bool &NeedToScalarize);
1605 
1606   /// Invalidates decisions already taken by the cost model.
1607   void invalidateCostModelingDecisions() {
1608     WideningDecisions.clear();
1609     Uniforms.clear();
1610     Scalars.clear();
1611   }
1612 
1613 private:
1614   unsigned NumPredStores = 0;
1615 
1616   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1617   /// than zero. One is returned if vectorization should best be avoided due
1618   /// to cost.
1619   ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1620                                     ElementCount UserVF);
1621 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1629   using VectorizationCostTy = std::pair<unsigned, bool>;
1630 
1631   /// Returns the expected execution cost. The unit of the cost does
1632   /// not matter because we use the 'cost' units to compare different
1633   /// vector widths. The cost that is returned is *not* normalized by
1634   /// the factor width.
1635   VectorizationCostTy expectedCost(ElementCount VF);
1636 
1637   /// Returns the execution time cost of an instruction for a given vector
1638   /// width. Vector width of one means scalar.
1639   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1640 
1641   /// The cost-computation logic from getInstructionCost which provides
1642   /// the vector type as an output parameter.
1643   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1644 
1645   /// Calculate vectorization cost of memory instruction \p I.
1646   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1647 
1648   /// The cost computation for scalarized memory instruction.
1649   unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1650 
1651   /// The cost computation for interleaving group of memory instructions.
1652   unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1653 
1654   /// The cost computation for Gather/Scatter instruction.
1655   unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1656 
1657   /// The cost computation for widening instruction \p I with consecutive
1658   /// memory access.
1659   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1660 
  /// The cost calculation for a Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of last
  /// element)
1665   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1666 
1667   /// Estimate the overhead of scalarizing an instruction. This is a
1668   /// convenience wrapper for the type-based getScalarizationOverhead API.
1669   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1670 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1673   bool isConsecutiveLoadOrStore(Instruction *I);
1674 
1675   /// Returns true if an artificially high cost for emulated masked memrefs
1676   /// should be used.
1677   bool useEmulatedMaskMemRefHack(Instruction *I);
1678 
1679   /// Map of scalar integer values to the smallest bitwidth they can be legally
1680   /// represented as. The vector equivalents of these values should be truncated
1681   /// to this type.
1682   MapVector<Instruction *, uint64_t> MinBWs;
1683 
1684   /// A type representing the costs for instructions if they were to be
1685   /// scalarized rather than vectorized. The entries are Instruction-Cost
1686   /// pairs.
1687   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1688 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1691   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1692 
1693   /// Records whether it is allowed to have the original scalar loop execute at
1694   /// least once. This may be needed as a fallback loop in case runtime
1695   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or not a multiple of the VF,
1697   /// or as a peel-loop to handle gaps in interleave-groups.
1698   /// Under optsize and when the trip count is very small we don't allow any
1699   /// iterations to execute in the scalar loop.
1700   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1701 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1703   bool FoldTailByMasking = false;
1704 
1705   /// A map holding scalar costs for different vectorization factors. The
1706   /// presence of a cost for an instruction in the mapping indicates that the
1707   /// instruction will be scalarized when vectorizing with the associated
1708   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1709   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1710 
1711   /// Holds the instructions known to be uniform after vectorization.
1712   /// The data is collected per VF.
1713   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1714 
1715   /// Holds the instructions known to be scalar after vectorization.
1716   /// The data is collected per VF.
1717   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1718 
1719   /// Holds the instructions (address computations) that are forced to be
1720   /// scalarized.
1721   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1722 
1723   /// PHINodes of the reductions that should be expanded in-loop along with
1724   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1726   ReductionChainMap InLoopReductionChains;
1727 
1728   /// Returns the expected difference in cost from scalarizing the expression
1729   /// feeding a predicated instruction \p PredInst. The instructions to
1730   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1731   /// non-negative return value implies the expression will be scalarized.
1732   /// Currently, only single-use chains are considered for scalarization.
1733   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1734                               ElementCount VF);
1735 
1736   /// Collect the instructions that are uniform after vectorization. An
1737   /// instruction is uniform if we represent it with a single scalar value in
1738   /// the vectorized loop corresponding to each vector iteration. Examples of
1739   /// uniform instructions include pointer operands of consecutive or
1740   /// interleaved memory accesses. Note that although uniformity implies an
1741   /// instruction will be scalar, the reverse is not true. In general, a
1742   /// scalarized instruction will be represented by VF scalar values in the
1743   /// vectorized loop, each corresponding to an iteration of the original
1744   /// scalar loop.
1745   void collectLoopUniforms(ElementCount VF);
1746 
1747   /// Collect the instructions that are scalar after vectorization. An
1748   /// instruction is scalar if it is known to be uniform or will be scalarized
1749   /// during vectorization. Non-uniform scalarized instructions will be
1750   /// represented by VF values in the vectorized loop, each corresponding to an
1751   /// iteration of the original scalar loop.
1752   void collectLoopScalars(ElementCount VF);
1753 
  /// Keeps cost model vectorization decisions and costs for instructions.
1755   /// Right now it is used for memory instructions only.
1756   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1757                                 std::pair<InstWidening, unsigned>>;
1758 
1759   DecisionList WideningDecisions;
1760 
1761   /// Returns true if \p V is expected to be vectorized and it needs to be
1762   /// extracted.
1763   bool needsExtract(Value *V, ElementCount VF) const {
1764     Instruction *I = dyn_cast<Instruction>(V);
1765     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1766         TheLoop->isLoopInvariant(I))
1767       return false;
1768 
1769     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1771     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1772     // the scalars are collected. That should be a safe assumption in most
1773     // cases, because we check if the operands have vectorizable types
1774     // beforehand in LoopVectorizationLegality.
1775     return Scalars.find(VF) == Scalars.end() ||
1776            !isScalarAfterVectorization(I, VF);
1777   };
1778 
1779   /// Returns a range containing only operands needing to be extracted.
1780   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1781                                                    ElementCount VF) {
1782     return SmallVector<Value *, 4>(make_filter_range(
1783         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1784   }
1785 
1786   /// Determines if we have the infrastructure to vectorize loop \p L and its
1787   /// epilogue, assuming the main loop is vectorized by \p VF.
1788   bool isCandidateForEpilogueVectorization(const Loop &L,
1789                                            const ElementCount VF) const;
1790 
1791   /// Returns true if epilogue vectorization is considered profitable, and
1792   /// false otherwise.
1793   /// \p VF is the vectorization factor chosen for the original loop.
1794   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1795 
1796 public:
1797   /// The loop that we evaluate.
1798   Loop *TheLoop;
1799 
1800   /// Predicated scalar evolution analysis.
1801   PredicatedScalarEvolution &PSE;
1802 
1803   /// Loop Info analysis.
1804   LoopInfo *LI;
1805 
1806   /// Vectorization legality.
1807   LoopVectorizationLegality *Legal;
1808 
1809   /// Vector target information.
1810   const TargetTransformInfo &TTI;
1811 
1812   /// Target Library Info.
1813   const TargetLibraryInfo *TLI;
1814 
1815   /// Demanded bits analysis.
1816   DemandedBits *DB;
1817 
1818   /// Assumption cache.
1819   AssumptionCache *AC;
1820 
1821   /// Interface to emit optimization remarks.
1822   OptimizationRemarkEmitter *ORE;
1823 
1824   const Function *TheFunction;
1825 
1826   /// Loop Vectorize Hint.
1827   const LoopVectorizeHints *Hints;
1828 
  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride that are close to each other.
1831   InterleavedAccessInfo &InterleaveInfo;
1832 
1833   /// Values to ignore in the cost model.
1834   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1835 
1836   /// Values to ignore in the cost model when VF > 1.
1837   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1838 
1839   /// Profitable vector factors.
1840   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1841 };
1842 
1843 } // end namespace llvm
1844 
1845 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
1848 // vector length information is not provided, vectorization is not considered
1849 // explicit. Interleave hints are not allowed either. These limitations will be
1850 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
1853 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1854 // provides *explicit vectorization hints* (LV can bypass legal checks and
1855 // assume that vectorization is legal). However, both hints are implemented
1856 // using the same metadata (llvm.loop.vectorize, processed by
1857 // LoopVectorizeHints). This will be fixed in the future when the native IR
1858 // representation for pragma 'omp simd' is introduced.
1859 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1860                                    OptimizationRemarkEmitter *ORE) {
1861   assert(!OuterLp->isInnermost() && "This is not an outer loop");
1862   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1863 
1864   // Only outer loops with an explicit vectorization hint are supported.
1865   // Unannotated outer loops are ignored.
1866   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1867     return false;
1868 
1869   Function *Fn = OuterLp->getHeader()->getParent();
1870   if (!Hints.allowVectorization(Fn, OuterLp,
1871                                 true /*VectorizeOnlyWhenForced*/)) {
1872     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1873     return false;
1874   }
1875 
1876   if (Hints.getInterleave() > 1) {
1877     // TODO: Interleave support is future work.
1878     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1879                          "outer loops.\n");
1880     Hints.emitRemarkWithHints();
1881     return false;
1882   }
1883 
1884   return true;
1885 }
1886 
1887 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1888                                   OptimizationRemarkEmitter *ORE,
1889                                   SmallVectorImpl<Loop *> &V) {
1890   // Collect inner loops and outer loops without irreducible control flow. For
1891   // now, only collect outer loops that have explicit vectorization hints. If we
1892   // are stress testing the VPlan H-CFG construction, we collect the outermost
1893   // loop of every loop nest.
1894   if (L.isInnermost() || VPlanBuildStressTest ||
1895       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1896     LoopBlocksRPO RPOT(&L);
1897     RPOT.perform(LI);
1898     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1899       V.push_back(&L);
1900       // TODO: Collect inner loops inside marked outer loops in case
1901       // vectorization fails for the outer loop. Do not invoke
1902       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1903       // already known to be reducible. We can use an inherited attribute for
1904       // that.
1905       return;
1906     }
1907   }
1908   for (Loop *InnerL : L)
1909     collectSupportedLoops(*InnerL, LI, ORE, V);
1910 }
1911 
1912 namespace {
1913 
1914 /// The LoopVectorize Pass.
1915 struct LoopVectorize : public FunctionPass {
1916   /// Pass identification, replacement for typeid
1917   static char ID;
1918 
1919   LoopVectorizePass Impl;
1920 
1921   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1922                          bool VectorizeOnlyWhenForced = false)
1923       : FunctionPass(ID),
1924         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1925     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1926   }
1927 
1928   bool runOnFunction(Function &F) override {
1929     if (skipFunction(F))
1930       return false;
1931 
1932     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1933     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1934     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1935     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1936     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1937     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1938     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1939     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1940     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1941     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1942     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1943     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1944     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1945 
1946     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1947         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1948 
1949     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1950                         GetLAA, *ORE, PSI).MadeAnyChange;
1951   }
1952 
1953   void getAnalysisUsage(AnalysisUsage &AU) const override {
1954     AU.addRequired<AssumptionCacheTracker>();
1955     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1956     AU.addRequired<DominatorTreeWrapperPass>();
1957     AU.addRequired<LoopInfoWrapperPass>();
1958     AU.addRequired<ScalarEvolutionWrapperPass>();
1959     AU.addRequired<TargetTransformInfoWrapperPass>();
1960     AU.addRequired<AAResultsWrapperPass>();
1961     AU.addRequired<LoopAccessLegacyAnalysis>();
1962     AU.addRequired<DemandedBitsWrapperPass>();
1963     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1964     AU.addRequired<InjectTLIMappingsLegacy>();
1965 
1966     // We currently do not preserve loopinfo/dominator analyses with outer loop
1967     // vectorization. Until this is addressed, mark these analyses as preserved
1968     // only for non-VPlan-native path.
1969     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1970     if (!EnableVPlanNativePath) {
1971       AU.addPreserved<LoopInfoWrapperPass>();
1972       AU.addPreserved<DominatorTreeWrapperPass>();
1973     }
1974 
1975     AU.addPreserved<BasicAAWrapperPass>();
1976     AU.addPreserved<GlobalsAAWrapperPass>();
1977     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1978   }
1979 };
1980 
1981 } // end anonymous namespace
1982 
1983 //===----------------------------------------------------------------------===//
1984 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1985 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1986 //===----------------------------------------------------------------------===//
1987 
1988 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1992   Instruction *Instr = dyn_cast<Instruction>(V);
1993   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1994                      (!Instr ||
1995                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1996   // Place the code for broadcasting invariant variables in the new preheader.
1997   IRBuilder<>::InsertPointGuard Guard(Builder);
1998   if (SafeToHoist)
1999     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2000 
2001   // Broadcast the scalar into all locations in the vector.
2002   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2003 
2004   return Shuf;
2005 }
2006 
2007 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2008     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
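  // Sketch of the generated code (illustrative, for an integer IV with VF = 4
  // and UF = 1):
  //   vector.ph:
  //     %stepped.start = <start,start,start,start> + <0,1,2,3> * step
  //   vector.body:
  //     %vec.ind = phi [ %stepped.start, %vector.ph ], [ %vec.ind.next, ... ]
  //     ...
  //     %vec.ind.next = add %vec.ind, <4*step, 4*step, 4*step, 4*step>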
2009   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2010          "Expected either an induction phi-node or a truncate of it!");
2011   Value *Start = II.getStartValue();
2012 
2013   // Construct the initial value of the vector IV in the vector loop preheader
2014   auto CurrIP = Builder.saveIP();
2015   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2016   if (isa<TruncInst>(EntryVal)) {
2017     assert(Start->getType()->isIntegerTy() &&
2018            "Truncation requires an integer type");
2019     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2020     Step = Builder.CreateTrunc(Step, TruncType);
2021     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2022   }
2023   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2024   Value *SteppedStart =
2025       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2026 
2027   // We create vector phi nodes for both integer and floating-point induction
2028   // variables. Here, we determine the kind of arithmetic we will perform.
2029   Instruction::BinaryOps AddOp;
2030   Instruction::BinaryOps MulOp;
2031   if (Step->getType()->isIntegerTy()) {
2032     AddOp = Instruction::Add;
2033     MulOp = Instruction::Mul;
2034   } else {
2035     AddOp = II.getInductionOpcode();
2036     MulOp = Instruction::FMul;
2037   }
2038 
2039   // Multiply the vectorization factor by the step using integer or
2040   // floating-point arithmetic as appropriate.
2041   Value *ConstVF =
2042       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
2043   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
2044 
2045   // Create a vector splat to use in the induction update.
2046   //
2047   // FIXME: If the step is non-constant, we create the vector splat with
2048   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2049   //        handle a constant vector splat.
2050   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2051   Value *SplatVF = isa<Constant>(Mul)
2052                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2053                        : Builder.CreateVectorSplat(VF, Mul);
2054   Builder.restoreIP(CurrIP);
2055 
2056   // We may need to add the step a number of times, depending on the unroll
2057   // factor. The last of those goes into the PHI.
2058   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2059                                     &*LoopVectorBody->getFirstInsertionPt());
2060   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2061   Instruction *LastInduction = VecInd;
2062   for (unsigned Part = 0; Part < UF; ++Part) {
2063     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
2064 
2065     if (isa<TruncInst>(EntryVal))
2066       addMetadata(LastInduction, EntryVal);
2067     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
2068 
2069     LastInduction = cast<Instruction>(addFastMathFlag(
2070         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
2071     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2072   }
2073 
2074   // Move the last step to the end of the latch block. This ensures consistent
2075   // placement of all induction updates.
2076   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2077   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2078   auto *ICmp = cast<Instruction>(Br->getCondition());
2079   LastInduction->moveBefore(ICmp);
2080   LastInduction->setName("vec.ind.next");
2081 
2082   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2083   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2084 }
2085 
2086 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2087   return Cost->isScalarAfterVectorization(I, VF) ||
2088          Cost->isProfitableToScalarize(I, VF);
2089 }
2090 
2091 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2092   if (shouldScalarizeInstruction(IV))
2093     return true;
2094   auto isScalarInst = [&](User *U) -> bool {
2095     auto *I = cast<Instruction>(U);
2096     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2097   };
2098   return llvm::any_of(IV->users(), isScalarInst);
2099 }
2100 
2101 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2102     const InductionDescriptor &ID, const Instruction *EntryVal,
2103     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
2104   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2105          "Expected either an induction phi-node or a truncate of it!");
2106 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV, based on the proof that the casted phi is equal to the
  // uncasted phi in the vectorized loop (possibly under a runtime guard). It
  // reuses the same InductionDescriptor that the original IV uses, but we don't
  // have to do any recording in this case - that is done when the original IV
  // is processed.
2113   if (isa<TruncInst>(EntryVal))
2114     return;
2115 
2116   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2117   if (Casts.empty())
2118     return;
2119   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if any) have no uses outside the
2121   // induction update chain itself.
2122   Instruction *CastInst = *Casts.begin();
2123   if (Lane < UINT_MAX)
2124     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
2125   else
2126     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
2127 }
2128 
2129 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
2130   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2131          "Primary induction variable must have an integer type");
2132 
2133   auto II = Legal->getInductionVars().find(IV);
2134   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2135 
2136   auto ID = II->second;
2137   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2138 
2139   // The value from the original loop to which we are mapping the new induction
2140   // variable.
2141   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2142 
2143   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2144 
2145   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
2147   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2148     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2149            "Induction step should be loop invariant");
2150     if (PSE.getSE()->isSCEVable(IV->getType())) {
2151       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2152       return Exp.expandCodeFor(Step, Step->getType(),
2153                                LoopVectorPreHeader->getTerminator());
2154     }
2155     return cast<SCEVUnknown>(Step)->getValue();
2156   };
2157 
2158   // The scalar value to broadcast. This is derived from the canonical
2159   // induction variable. If a truncation type is given, truncate the canonical
2160   // induction variable and step. Otherwise, derive these values from the
2161   // induction descriptor.
2162   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2163     Value *ScalarIV = Induction;
2164     if (IV != OldInduction) {
2165       ScalarIV = IV->getType()->isIntegerTy()
2166                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2167                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2168                                           IV->getType());
2169       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2170       ScalarIV->setName("offset.idx");
2171     }
2172     if (Trunc) {
2173       auto *TruncType = cast<IntegerType>(Trunc->getType());
2174       assert(Step->getType()->isIntegerTy() &&
2175              "Truncation requires an integer step");
2176       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2177       Step = Builder.CreateTrunc(Step, TruncType);
2178     }
2179     return ScalarIV;
2180   };
2181 
  // Create the vector values from the scalar IV, for the case where we are
  // not creating a vector IV.
2184   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2185     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2186     for (unsigned Part = 0; Part < UF; ++Part) {
2187       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2188       Value *EntryPart =
2189           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2190                         ID.getInductionOpcode());
2191       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2192       if (Trunc)
2193         addMetadata(EntryPart, Trunc);
2194       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
2195     }
2196   };
2197 
2198   // Now do the actual transformations, and start with creating the step value.
2199   Value *Step = CreateStepValue(ID.getStep());
2200   if (VF.isZero() || VF.isScalar()) {
2201     Value *ScalarIV = CreateScalarIV(Step);
2202     CreateSplatIV(ScalarIV, Step);
2203     return;
2204   }
2205 
2206   // Determine if we want a scalar version of the induction variable. This is
2207   // true if the induction variable itself is not widened, or if it has at
2208   // least one user in the loop that is not widened.
2209   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2210   if (!NeedsScalarIV) {
2211     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2212     return;
2213   }
2214 
2215   // Try to create a new independent vector induction variable. If we can't
2216   // create the phi node, we will splat the scalar induction variable in each
2217   // loop iteration.
2218   if (!shouldScalarizeInstruction(EntryVal)) {
2219     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2220     Value *ScalarIV = CreateScalarIV(Step);
2221     // Create scalar steps that can be used by instructions we will later
2222     // scalarize. Note that the addition of the scalar steps will not increase
2223     // the number of instructions in the loop in the common case prior to
2224     // InstCombine. We will be trading one vector extract for each scalar step.
2225     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2226     return;
2227   }
2228 
  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV, except when we tail-fold; then the splat IV feeds the
  // predicate used by the masked loads/stores.
2232   Value *ScalarIV = CreateScalarIV(Step);
2233   if (!Cost->isScalarEpilogueAllowed())
2234     CreateSplatIV(ScalarIV, Step);
2235   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2236 }
2237 
2238 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2239                                           Instruction::BinaryOps BinOp) {
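  // For example (illustrative): with Val = <%x, %x, %x, %x>, StartIdx = 0 and
  // Step = %s, this returns %x + <0, 1, 2, 3> * %s, i.e. the per-lane values
  // %x, %x + %s, %x + 2*%s and %x + 3*%s.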
2240   // Create and check the types.
2241   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2242   int VLen = ValVTy->getNumElements();
2243 
2244   Type *STy = Val->getType()->getScalarType();
2245   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2246          "Induction Step must be an integer or FP");
2247   assert(Step->getType() == STy && "Step has wrong type");
2248 
2249   SmallVector<Constant *, 8> Indices;
2250 
2251   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers starting at StartIdx.
2253     for (int i = 0; i < VLen; ++i)
2254       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2255 
2256     // Add the consecutive indices to the vector value.
2257     Constant *Cv = ConstantVector::get(Indices);
2258     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2259     Step = Builder.CreateVectorSplat(VLen, Step);
2260     assert(Step->getType() == Val->getType() && "Invalid step vec");
2261     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2262     // which can be found from the original scalar operations.
2263     Step = Builder.CreateMul(Cv, Step);
2264     return Builder.CreateAdd(Val, Step, "induction");
2265   }
2266 
2267   // Floating point induction.
2268   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2269          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers starting at StartIdx.
2271   for (int i = 0; i < VLen; ++i)
2272     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2273 
2274   // Add the consecutive indices to the vector value.
2275   Constant *Cv = ConstantVector::get(Indices);
2276 
2277   Step = Builder.CreateVectorSplat(VLen, Step);
2278 
2279   // Floating point operations had to be 'fast' to enable the induction.
2280   FastMathFlags Flags;
2281   Flags.setFast();
2282 
2283   Value *MulOp = Builder.CreateFMul(Cv, Step);
2284   if (isa<Instruction>(MulOp))
2285     // Have to check, MulOp may be a constant
2286     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2287 
2288   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2289   if (isa<Instruction>(BOp))
2290     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2291   return BOp;
2292 }
2293 
2294 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2295                                            Instruction *EntryVal,
2296                                            const InductionDescriptor &ID) {
2297   // We shouldn't have to build scalar steps if we aren't vectorizing.
2298   assert(VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2300   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2301   assert(ScalarIVTy == Step->getType() &&
2302          "Val and Step should have the same type");
2303 
2304   // We build scalar steps for both integer and floating-point induction
2305   // variables. Here, we determine the kind of arithmetic we will perform.
2306   Instruction::BinaryOps AddOp;
2307   Instruction::BinaryOps MulOp;
2308   if (ScalarIVTy->isIntegerTy()) {
2309     AddOp = Instruction::Add;
2310     MulOp = Instruction::Mul;
2311   } else {
2312     AddOp = ID.getInductionOpcode();
2313     MulOp = Instruction::FMul;
2314   }
2315 
2316   // Determine the number of scalars we need to generate for each unroll
2317   // iteration. If EntryVal is uniform, we only need to generate the first
2318   // lane. Otherwise, we generate all VF values.
2319   unsigned Lanes =
2320       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2321           ? 1
2322           : VF.getKnownMinValue();
2323   assert((!VF.isScalable() || Lanes == 1) &&
2324          "Should never scalarize a scalable vector");
2325   // Compute the scalar steps and save the results in VectorLoopValueMap.
2326   for (unsigned Part = 0; Part < UF; ++Part) {
2327     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2328       auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2329                                          ScalarIVTy->getScalarSizeInBits());
2330       Value *StartIdx =
2331           createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2332       if (ScalarIVTy->isFloatingPointTy())
2333         StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
2334       StartIdx = addFastMathFlag(Builder.CreateBinOp(
2335           AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
2336       // The step returned by `createStepForVF` is a runtime-evaluated value
2337       // when VF is scalable. Otherwise, it should be folded into a Constant.
2338       assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2339              "Expected StartIdx to be folded to a constant when VF is not "
2340              "scalable");
2341       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2342       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2343       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2344       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2345     }
2346   }
2347 }
2348 
2349 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2350   assert(V != Induction && "The new induction variable should not be used.");
2351   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2352   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2353 
2354   // If we have a stride that is replaced by one, do it here. Defer this for
2355   // the VPlan-native path until we start running Legal checks in that path.
2356   if (!EnableVPlanNativePath && Legal->hasStride(V))
2357     V = ConstantInt::get(V->getType(), 1);
2358 
2359   // If we have a vector mapped to this value, return it.
2360   if (VectorLoopValueMap.hasVectorValue(V, Part))
2361     return VectorLoopValueMap.getVectorValue(V, Part);
2362 
2363   // If the value has not been vectorized, check if it has been scalarized
2364   // instead. If it has been scalarized, and we actually need the value in
2365   // vector form, we will construct the vector values on demand.
2366   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2367     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2368 
2369     // If we've scalarized a value, that value should be an instruction.
2370     auto *I = cast<Instruction>(V);
2371 
2372     // If we aren't vectorizing, we can just copy the scalar map values over to
2373     // the vector map.
2374     if (VF.isScalar()) {
2375       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2376       return ScalarValue;
2377     }
2378 
2379     // Get the last scalar instruction we generated for V and Part. If the value
2380     // is known to be uniform after vectorization, this corresponds to lane zero
2381     // of the Part unroll iteration. Otherwise, the last instruction is the one
2382     // we created for the last vector lane of the Part unroll iteration.
2383     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2384                             ? 0
2385                             : VF.getKnownMinValue() - 1;
2386     assert((!VF.isScalable() || LastLane == 0) &&
2387            "Scalable vectorization can't lead to any scalarized values.");
2388     auto *LastInst = cast<Instruction>(
2389         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2390 
2391     // Set the insert point after the last scalarized instruction. This ensures
2392     // the insertelement sequence will directly follow the scalar definitions.
2393     auto OldIP = Builder.saveIP();
2394     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2395     Builder.SetInsertPoint(&*NewIP);
2396 
2397     // However, if we are vectorizing, we need to construct the vector values.
2398     // If the value is known to be uniform after vectorization, we can just
2399     // broadcast the scalar value corresponding to lane zero for each unroll
2400     // iteration. Otherwise, we construct the vector values using insertelement
2401     // instructions. Since the resulting vectors are stored in
2402     // VectorLoopValueMap, we will only generate the insertelements once.
2403     Value *VectorValue = nullptr;
2404     if (Cost->isUniformAfterVectorization(I, VF)) {
2405       VectorValue = getBroadcastInstrs(ScalarValue);
2406       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2407     } else {
2408       // Initialize packing with insertelements to start from undef.
2409       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2410       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2411       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2412       for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2413         packScalarIntoVectorValue(V, {Part, Lane});
2414       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2415     }
2416     Builder.restoreIP(OldIP);
2417     return VectorValue;
2418   }
2419 
2420   // If this scalar is unknown, assume that it is a constant or that it is
2421   // loop invariant. Broadcast V and save the value for future uses.
2422   Value *B = getBroadcastInstrs(V);
2423   VectorLoopValueMap.setVectorValue(V, Part, B);
2424   return B;
2425 }
2426 
2427 Value *
2428 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2429                                             const VPIteration &Instance) {
2430   // If the value is not an instruction contained in the loop, it should
2431   // already be scalar.
2432   if (OrigLoop->isLoopInvariant(V))
2433     return V;
2434 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2438 
2439   // If the value from the original loop has not been vectorized, it is
2440   // represented by UF x VF scalar values in the new loop. Return the requested
2441   // scalar value.
2442   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2443     return VectorLoopValueMap.getScalarValue(V, Instance);
2444 
2445   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2446   // for the given unroll part. If this entry is not a vector type (i.e., the
2447   // vectorization factor is one), there is no need to generate an
2448   // extractelement instruction.
2449   auto *U = getOrCreateVectorValue(V, Instance.Part);
2450   if (!U->getType()->isVectorTy()) {
2451     assert(VF.isScalar() && "Value not scalarized has non-vector type");
2452     return U;
2453   }
2454 
2455   // Otherwise, the value from the original loop has been vectorized and is
2456   // represented by UF vector values. Extract and return the requested scalar
2457   // value from the appropriate vector lane.
2458   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2459 }
2460 
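// Insert the scalar value recorded for (Instance.Part, Instance.Lane) of V
// into the partially packed vector value for that unroll part, and update the
// corresponding entry in VectorLoopValueMap.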
2461 void InnerLoopVectorizer::packScalarIntoVectorValue(
2462     Value *V, const VPIteration &Instance) {
2463   assert(V != Induction && "The new induction variable should not be used.");
2464   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2465   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2466 
2467   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2468   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2469   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2470                                             Builder.getInt32(Instance.Lane));
2471   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2472 }
2473 
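// Reverse the elements of a fixed-width vector value by shuffling it with the
// mask <VF-1, VF-2, ..., 1, 0>.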
2474 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2475   assert(Vec->getType()->isVectorTy() && "Invalid type");
2476   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
2477   SmallVector<int, 8> ShuffleMask;
2478   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2479     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2480 
2481   return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2482 }
2483 
2484 // Return whether we allow using masked interleave-groups (for dealing with
2485 // strided loads/stores that reside in predicated blocks, or for dealing
2486 // with gaps).
2487 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2488   // If an override option has been passed in for interleaved accesses, use it.
2489   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2490     return EnableMaskedInterleavedMemAccesses;
2491 
2492   return TTI.enableMaskedInterleavedAccessVectorization();
2493 }
2494 
2495 // Try to vectorize the interleave group that \p Instr belongs to.
2496 //
2497 // E.g. Translate following interleaved load group (factor = 3):
2498 //   for (i = 0; i < N; i+=3) {
2499 //     R = Pic[i];             // Member of index 0
2500 //     G = Pic[i+1];           // Member of index 1
2501 //     B = Pic[i+2];           // Member of index 2
2502 //     ... // do something to R, G, B
2503 //   }
2504 // To:
2505 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2506 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2507 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2508 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2509 //
2510 // Or translate following interleaved store group (factor = 3):
2511 //   for (i = 0; i < N; i+=3) {
2512 //     ... do something to R, G, B
2513 //     Pic[i]   = R;           // Member of index 0
2514 //     Pic[i+1] = G;           // Member of index 1
2515 //     Pic[i+2] = B;           // Member of index 2
2516 //   }
2517 // To:
2518 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2519 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2520 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2521 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2522 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2523 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2524     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2525     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2526     VPValue *BlockInMask) {
2527   Instruction *Instr = Group->getInsertPos();
2528   const DataLayout &DL = Instr->getModule()->getDataLayout();
2529 
2530   // Prepare for the vector type of the interleaved load/store.
2531   Type *ScalarTy = getMemInstValueType(Instr);
2532   unsigned InterleaveFactor = Group->getFactor();
2533   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2534   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2535 
2536   // Prepare for the new pointers.
2537   SmallVector<Value *, 2> AddrParts;
2538   unsigned Index = Group->getIndex(Instr);
2539 
2540   // TODO: extend the masked interleaved-group support to reversed access.
2541   assert((!BlockInMask || !Group->isReverse()) &&
2542          "Reversed masked interleave-group not supported.");
2543 
2544   // If the group is reverse, adjust the index to refer to the last vector lane
2545   // instead of the first. We adjust the index from the first vector lane,
2546   // rather than directly getting the pointer for lane VF - 1, because the
2547   // pointer operand of the interleaved access is supposed to be uniform. For
2548   // uniform instructions, we're only required to generate a value for the
2549   // first vector lane in each unroll iteration.
2550   assert(!VF.isScalable() &&
2551          "scalable vector reverse operation is not implemented");
2552   if (Group->isReverse())
2553     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2554 
2555   for (unsigned Part = 0; Part < UF; Part++) {
2556     Value *AddrPart = State.get(Addr, {Part, 0});
2557     setDebugLocFromInst(Builder, AddrPart);
2558 
    // Note that the current instruction may be at any index within the group.
    // We need to adjust the address down to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2570 
2571     bool InBounds = false;
2572     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2573       InBounds = gep->isInBounds();
2574     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2575     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2576 
2577     // Cast to the vector pointer type.
2578     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2579     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2580     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2581   }
2582 
2583   setDebugLocFromInst(Builder, Instr);
2584   Value *UndefVec = UndefValue::get(VecTy);
2585 
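  // If the group has gaps and we cannot rely on a scalar epilogue to avoid
  // accessing past the last member, build a mask that disables the lanes
  // corresponding to the gaps.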
2586   Value *MaskForGaps = nullptr;
2587   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2588     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2589     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2590     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2591   }
2592 
2593   // Vectorize the interleaved load group.
2594   if (isa<LoadInst>(Instr)) {
2595     // For each unroll part, create a wide load for the group.
2596     SmallVector<Value *, 2> NewLoads;
2597     for (unsigned Part = 0; Part < UF; Part++) {
2598       Instruction *NewLoad;
2599       if (BlockInMask || MaskForGaps) {
2600         assert(useMaskedInterleavedAccesses(*TTI) &&
2601                "masked interleaved groups are not allowed.");
2602         Value *GroupMask = MaskForGaps;
2603         if (BlockInMask) {
2604           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2605           assert(!VF.isScalable() && "scalable vectors not yet supported.");
2606           Value *ShuffledMask = Builder.CreateShuffleVector(
2607               BlockInMaskPart,
2608               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2609               "interleaved.mask");
2610           GroupMask = MaskForGaps
2611                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2612                                                 MaskForGaps)
2613                           : ShuffledMask;
2614         }
2615         NewLoad =
2616             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2617                                      GroupMask, UndefVec, "wide.masked.vec");
2618       }
2619       else
2620         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2621                                             Group->getAlign(), "wide.vec");
2622       Group->addMetadata(NewLoad);
2623       NewLoads.push_back(NewLoad);
2624     }
2625 
2626     // For each member in the group, shuffle out the appropriate data from the
2627     // wide loads.
2628     unsigned J = 0;
2629     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2630       Instruction *Member = Group->getMember(I);
2631 
2632       // Skip the gaps in the group.
2633       if (!Member)
2634         continue;
2635 
2636       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2637       auto StrideMask =
2638           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2639       for (unsigned Part = 0; Part < UF; Part++) {
2640         Value *StridedVec = Builder.CreateShuffleVector(
2641             NewLoads[Part], StrideMask, "strided.vec");
2642 
        // If this member has a different type, cast the result to that type.
2644         if (Member->getType() != ScalarTy) {
2645           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2646           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2647           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2648         }
2649 
2650         if (Group->isReverse())
2651           StridedVec = reverseVector(StridedVec);
2652 
2653         State.set(VPDefs[J], Member, StridedVec, Part);
2654       }
2655       ++J;
2656     }
2657     return;
2658   }
2659 
  // The subvector type for the current instruction.
2661   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2662   auto *SubVT = VectorType::get(ScalarTy, VF);
2663 
2664   // Vectorize the interleaved store group.
2665   for (unsigned Part = 0; Part < UF; Part++) {
2666     // Collect the stored vector from each member.
2667     SmallVector<Value *, 4> StoredVecs;
2668     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow a gap, so each index has a
      // member.
      assert(Group->getMember(i) &&
             "Failed to get a member from an interleaved store group");
2671 
2672       Value *StoredVec = State.get(StoredValues[i], Part);
2673 
2674       if (Group->isReverse())
2675         StoredVec = reverseVector(StoredVec);
2676 
      // If this member has a different type, cast it to the unified type.
2679       if (StoredVec->getType() != SubVT)
2680         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2681 
2682       StoredVecs.push_back(StoredVec);
2683     }
2684 
2685     // Concatenate all vectors into a wide vector.
2686     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2687 
2688     // Interleave the elements in the wide vector.
2689     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2690     Value *IVec = Builder.CreateShuffleVector(
2691         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2692         "interleaved.vec");
2693 
2694     Instruction *NewStoreInstr;
2695     if (BlockInMask) {
2696       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2697       Value *ShuffledMask = Builder.CreateShuffleVector(
2698           BlockInMaskPart,
2699           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2700           "interleaved.mask");
2701       NewStoreInstr = Builder.CreateMaskedStore(
2702           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2703     }
2704     else
2705       NewStoreInstr =
2706           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2707 
2708     Group->addMetadata(NewStoreInstr);
2709   }
2710 }
2711 
2712 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2713     Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2714     VPValue *StoredValue, VPValue *BlockInMask) {
2715   // Attempt to issue a wide load.
2716   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2717   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2718 
2719   assert((LI || SI) && "Invalid Load/Store instruction");
2720   assert((!SI || StoredValue) && "No stored value provided for widened store");
2721   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2722 
2723   LoopVectorizationCostModel::InstWidening Decision =
2724       Cost->getWideningDecision(Instr, VF);
2725   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2726           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2727           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2728          "CM decision is not to widen the memory instruction");
2729 
2730   Type *ScalarDataTy = getMemInstValueType(Instr);
2731 
2732   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2733   const Align Alignment = getLoadStoreAlignment(Instr);
2734 
2735   // Determine if the pointer operand of the access is either consecutive or
2736   // reverse consecutive.
2737   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2738   bool ConsecutiveStride =
2739       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2740   bool CreateGatherScatter =
2741       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2742 
2743   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2744   // gather/scatter. Otherwise Decision should have been to Scalarize.
2745   assert((ConsecutiveStride || CreateGatherScatter) &&
2746          "The instruction should be scalarized");
2747   (void)ConsecutiveStride;
2748 
2749   VectorParts BlockInMaskParts(UF);
2750   bool isMaskRequired = BlockInMask;
2751   if (isMaskRequired)
2752     for (unsigned Part = 0; Part < UF; ++Part)
2753       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2754 
2755   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2756     // Calculate the pointer for the specific unroll-part.
2757     GetElementPtrInst *PartPtr = nullptr;
2758 
2759     bool InBounds = false;
2760     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2761       InBounds = gep->isInBounds();
2762 
2763     if (Reverse) {
2764       assert(!VF.isScalable() &&
2765              "Reversing vectors is not yet supported for scalable vectors.");
2766 
2767       // If the address is consecutive but reversed, then the
2768       // wide store needs to start at the last vector element.
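      // For example, with a fixed VF of 4 and Part == 1, the pointer is
      // adjusted by -1 * 4 and then by 1 - 4, i.e. to element -7, so the wide
      // access covers elements [-7, -4], which are reversed after the access.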
2769       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2770           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2771       PartPtr->setIsInBounds(InBounds);
2772       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2773           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2774       PartPtr->setIsInBounds(InBounds);
2775       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2776         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2777     } else {
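      // For a consecutive (non-reversed) access, unroll part Part starts
      // Part * VF elements past the base pointer.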
2778       Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2779       PartPtr = cast<GetElementPtrInst>(
2780           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2781       PartPtr->setIsInBounds(InBounds);
2782     }
2783 
2784     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2785     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2786   };
2787 
2788   // Handle Stores:
2789   if (SI) {
2790     setDebugLocFromInst(Builder, SI);
2791 
2792     for (unsigned Part = 0; Part < UF; ++Part) {
2793       Instruction *NewSI = nullptr;
2794       Value *StoredVal = State.get(StoredValue, Part);
2795       if (CreateGatherScatter) {
2796         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2797         Value *VectorGep = State.get(Addr, Part);
2798         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2799                                             MaskPart);
2800       } else {
2801         if (Reverse) {
2802           // If we store to reverse consecutive memory locations, then we need
2803           // to reverse the order of elements in the stored value.
2804           StoredVal = reverseVector(StoredVal);
2805           // We don't want to update the value in the map as it might be used in
2806           // another expression. So don't call resetVectorValue(StoredVal).
2807         }
2808         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2809         if (isMaskRequired)
2810           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2811                                             BlockInMaskParts[Part]);
2812         else
2813           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2814       }
2815       addMetadata(NewSI, SI);
2816     }
2817     return;
2818   }
2819 
2820   // Handle loads.
2821   assert(LI && "Must have a load instruction");
2822   setDebugLocFromInst(Builder, LI);
2823   for (unsigned Part = 0; Part < UF; ++Part) {
2824     Value *NewLI;
2825     if (CreateGatherScatter) {
2826       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2827       Value *VectorGep = State.get(Addr, Part);
2828       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2829                                          nullptr, "wide.masked.gather");
2830       addMetadata(NewLI, LI);
2831     } else {
2832       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2833       if (isMaskRequired)
2834         NewLI = Builder.CreateMaskedLoad(
2835             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2836             "wide.masked.load");
2837       else
2838         NewLI =
2839             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2840 
      // Add metadata to the load, but record the reversed shuffle as the
      // vector value.
2842       addMetadata(NewLI, LI);
2843       if (Reverse)
2844         NewLI = reverseVector(NewLI);
2845     }
2846 
2847     State.set(Def, Instr, NewLI, Part);
2848   }
2849 }
2850 
2851 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2852                                                const VPIteration &Instance,
2853                                                bool IfPredicateInstr,
2854                                                VPTransformState &State) {
2855   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2856 
2857   setDebugLocFromInst(Builder, Instr);
2858 
  // Does this instruction return a value?
2860   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2861 
2862   Instruction *Cloned = Instr->clone();
2863   if (!IsVoidRetTy)
2864     Cloned->setName(Instr->getName() + ".cloned");
2865 
2866   // Replace the operands of the cloned instructions with their scalar
2867   // equivalents in the new loop.
2868   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2869     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2870     auto InputInstance = Instance;
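    // Operands that are loop-invariant or uniform after vectorization only
    // have a scalar value for lane zero; use that value for every lane.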
2871     if (!Operand || !OrigLoop->contains(Operand) ||
2872         (Cost->isUniformAfterVectorization(Operand, State.VF)))
2873       InputInstance.Lane = 0;
2874     auto *NewOp = State.get(User.getOperand(op), InputInstance);
2875     Cloned->setOperand(op, NewOp);
2876   }
2877   addNewMetadata(Cloned, Instr);
2878 
2879   // Place the cloned scalar in the new loop.
2880   Builder.Insert(Cloned);
2881 
  // TODO: Set result for VPValue of VPReplicateRecipe. This requires
2883   // representing scalar values in VPTransformState. Add the cloned scalar to
2884   // the scalar map entry.
2885   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2886 
  // If we just cloned a new assumption, add it to the assumption cache.
2888   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2889     if (II->getIntrinsicID() == Intrinsic::assume)
2890       AC->registerAssumption(II);
2891 
2892   // End if-block.
2893   if (IfPredicateInstr)
2894     PredicatedInstructions.push_back(Cloned);
2895 }
2896 
2897 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2898                                                       Value *End, Value *Step,
2899                                                       Instruction *DL) {
2900   BasicBlock *Header = L->getHeader();
2901   BasicBlock *Latch = L->getLoopLatch();
2902   // As we're just creating this loop, it's possible no latch exists
2903   // yet. If so, use the header as this will be a single block loop.
2904   if (!Latch)
2905     Latch = Header;
2906 
2907   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2908   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2909   setDebugLocFromInst(Builder, OldInst);
2910   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2911 
2912   Builder.SetInsertPoint(Latch->getTerminator());
2913   setDebugLocFromInst(Builder, OldInst);
2914 
2915   // Create i+1 and fill the PHINode.
2916   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2917   Induction->addIncoming(Start, L->getLoopPreheader());
2918   Induction->addIncoming(Next, Latch);
2919   // Create the compare.
2920   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2921   Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
2922 
2923   // Now we have two terminators. Remove the old one from the block.
2924   Latch->getTerminator()->eraseFromParent();
2925 
2926   return Induction;
2927 }
2928 
2929 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2930   if (TripCount)
2931     return TripCount;
2932 
2933   assert(L && "Create Trip Count for null loop.");
2934   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2935   // Find the loop boundaries.
2936   ScalarEvolution *SE = PSE.getSE();
2937   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2938   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2939          "Invalid loop count");
2940 
2941   Type *IdxTy = Legal->getWidestInductionType();
2942   assert(IdxTy && "No type for induction");
2943 
2944   // The exit count might have the type of i64 while the phi is i32. This can
2945   // happen if we have an induction variable that is sign extended before the
  // compare. The only way we get a backedge-taken count is if the induction
  // variable was signed, and as such it will not overflow. In that case
  // truncation is legal.
2949   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2950       IdxTy->getPrimitiveSizeInBits())
2951     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2952   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2953 
2954   // Get the total trip count from the count by adding 1.
2955   const SCEV *ExitCount = SE->getAddExpr(
2956       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2957 
2958   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2959 
2960   // Expand the trip count and place the new instructions in the preheader.
2961   // Notice that the pre-header does not change, only the loop body.
2962   SCEVExpander Exp(*SE, DL, "induction");
2963 
2964   // Count holds the overall loop count (N).
2965   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2966                                 L->getLoopPreheader()->getTerminator());
2967 
2968   if (TripCount->getType()->isPointerTy())
2969     TripCount =
2970         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2971                                     L->getLoopPreheader()->getTerminator());
2972 
2973   return TripCount;
2974 }
2975 
2976 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2977   if (VectorTripCount)
2978     return VectorTripCount;
2979 
2980   Value *TC = getOrCreateTripCount(L);
2981   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2982 
2983   Type *Ty = TC->getType();
2984   // This is where we can make the step a runtime constant.
2985   Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
2986 
2987   // If the tail is to be folded by masking, round the number of iterations N
2988   // up to a multiple of Step instead of rounding down. This is done by first
2989   // adding Step-1 and then rounding down. Note that it's ok if this addition
2990   // overflows: the vector induction variable will eventually wrap to zero given
2991   // that it starts at zero and its Step is a power of two; the loop will then
2992   // exit, with the last early-exit vector comparison also producing all-true.
2993   if (Cost->foldTailByMasking()) {
2994     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2995            "VF*UF must be a power of 2 when folding tail by masking");
2996     assert(!VF.isScalable() &&
2997            "Tail folding not yet supported for scalable vectors");
2998     TC = Builder.CreateAdd(
2999         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3000   }
3001 
3002   // Now we need to generate the expression for the part of the loop that the
3003   // vectorized body will execute. This is equal to N - (N % Step) if scalar
3004   // iterations are not required for correctness, or N - Step, otherwise. Step
3005   // is equal to the vectorization factor (number of SIMD elements) times the
3006   // unroll factor (number of SIMD instructions).
3007   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
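  // For example, with a trip count of 10, VF = 4 and UF = 1 (and no tail
  // folding), R is 2 and the vector loop covers 8 of the 10 iterations.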
3008 
3009   // There are two cases where we need to ensure (at least) the last iteration
3010   // runs in the scalar remainder loop. Thus, if the step evenly divides
3011   // the trip count, we set the remainder to be equal to the step. If the step
3012   // does not evenly divide the trip count, no adjustment is necessary since
3013   // there will already be scalar iterations. Note that the minimum iterations
3014   // check ensures that N >= Step. The cases are:
3015   // 1) If there is a non-reversed interleaved group that may speculatively
3016   //    access memory out-of-bounds.
3017   // 2) If any instruction may follow a conditionally taken exit. That is, if
3018   //    the loop contains multiple exiting blocks, or a single exiting block
3019   //    which is not the latch.
3020   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
3021     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3022     R = Builder.CreateSelect(IsZero, Step, R);
3023   }
3024 
3025   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3026 
3027   return VectorTripCount;
3028 }
3029 
3030 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3031                                                    const DataLayout &DL) {
3032   // Verify that V is a vector type with same number of elements as DstVTy.
3033   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3034   unsigned VF = DstFVTy->getNumElements();
3035   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
3037   Type *SrcElemTy = SrcVecTy->getElementType();
3038   Type *DstElemTy = DstFVTy->getElementType();
3039   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3040          "Vector elements must have same size");
3041 
3042   // Do a direct cast if element types are castable.
3043   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3044     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3045   }
  // V cannot be directly cast to the desired vector type.
3047   // May happen when V is a floating point vector but DstVTy is a vector of
3048   // pointers or vice-versa. Handle this using a two-step bitcast using an
3049   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
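  // For example, on a target with 64-bit pointers, a <2 x double> source would
  // be cast to <2 x i64> and then to a <2 x i8*> destination.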
3050   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3051          "Only one type should be a pointer type");
3052   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3053          "Only one type should be a floating point type");
3054   Type *IntTy =
3055       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3056   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3057   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3058   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3059 }
3060 
3061 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3062                                                          BasicBlock *Bypass) {
3063   Value *Count = getOrCreateTripCount(L);
3064   // Reuse existing vector loop preheader for TC checks.
3065   // Note that new preheader block is generated for vector loop.
3066   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3067   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3068 
3069   // Generate code to check if the loop's trip count is less than VF * UF, or
3070   // equal to it in case a scalar epilogue is required; this implies that the
3071   // vector trip count is zero. This check also covers the case where adding one
3072   // to the backedge-taken count overflowed leading to an incorrect trip count
3073   // of zero. In this case we will also jump to the scalar loop.
3074   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3075                                           : ICmpInst::ICMP_ULT;
3076 
3077   // If tail is to be folded, vector loop takes care of all iterations.
3078   Value *CheckMinIters = Builder.getFalse();
3079   if (!Cost->foldTailByMasking()) {
3080     Value *Step =
3081         createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3082     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3083   }
3084   // Create new preheader for vector loop.
3085   LoopVectorPreHeader =
3086       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3087                  "vector.ph");
3088 
3089   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3090                                DT->getNode(Bypass)->getIDom()) &&
3091          "TC check is expected to dominate Bypass");
3092 
3093   // Update dominator for Bypass & LoopExit.
3094   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3095   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3096 
3097   ReplaceInstWithInst(
3098       TCCheckBlock->getTerminator(),
3099       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3100   LoopBypassBlocks.push_back(TCCheckBlock);
3101 }
3102 
3103 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3104   // Reuse existing vector loop preheader for SCEV checks.
3105   // Note that new preheader block is generated for vector loop.
3106   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
3107 
  // Generate the code to check the SCEV assumptions that we made.
3109   // We want the new basic block to start at the first instruction in a
3110   // sequence of instructions that form a check.
3111   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
3112                    "scev.check");
3113   Value *SCEVCheck = Exp.expandCodeForPredicate(
3114       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
3115 
3116   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
3117     if (C->isZero())
3118       return;
3119 
3120   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3121            (OptForSizeBasedOnProfile &&
3122             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3123          "Cannot SCEV check stride or overflow when optimizing for size");
3124 
3125   SCEVCheckBlock->setName("vector.scevcheck");
3126   // Create new preheader for vector loop.
3127   LoopVectorPreHeader =
3128       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
3129                  nullptr, "vector.ph");
3130 
3131   // Update dominator only if this is first RT check.
3132   if (LoopBypassBlocks.empty()) {
3133     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3134     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3135   }
3136 
3137   ReplaceInstWithInst(
3138       SCEVCheckBlock->getTerminator(),
3139       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
3140   LoopBypassBlocks.push_back(SCEVCheckBlock);
3141   AddedSafetyChecks = true;
3142 }
3143 
3144 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3145   // VPlan-native path does not do any analysis for runtime checks currently.
3146   if (EnableVPlanNativePath)
3147     return;
3148 
3149   // Reuse existing vector loop preheader for runtime memory checks.
3150   // Note that new preheader block is generated for vector loop.
3151   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
3152 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
3156   auto *LAI = Legal->getLAI();
3157   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
3158   if (!RtPtrChecking.Need)
3159     return;
3160 
3161   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3162     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3163            "Cannot emit memory checks when optimizing for size, unless forced "
3164            "to vectorize.");
3165     ORE->emit([&]() {
3166       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3167                                         L->getStartLoc(), L->getHeader())
3168              << "Code-size may be reduced by not forcing "
3169                 "vectorization, or by source-code modifications "
3170                 "eliminating the need for runtime checks "
3171                 "(e.g., adding 'restrict').";
3172     });
3173   }
3174 
3175   MemCheckBlock->setName("vector.memcheck");
3176   // Create new preheader for vector loop.
3177   LoopVectorPreHeader =
3178       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
3179                  "vector.ph");
3180 
  // Update dominator only if this is first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, MemCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
  }

  auto *CondBranch = cast<BranchInst>(
      Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
  ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
  LoopBypassBlocks.push_back(MemCheckBlock);
  AddedSafetyChecks = true;
3192 
3193   Instruction *FirstCheckInst;
3194   Instruction *MemRuntimeCheck;
3195   std::tie(FirstCheckInst, MemRuntimeCheck) =
3196       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
3197                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
3198   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
3199                             "claimed checks are required");
3200   CondBranch->setCondition(MemRuntimeCheck);
3201 
3202   // We currently don't use LoopVersioning for the actual loop cloning but we
3203   // still use it to add the noalias metadata.
3204   LVer = std::make_unique<LoopVersioning>(
3205       *Legal->getLAI(),
3206       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3207       DT, PSE.getSE());
3208   LVer->prepareNoAliasMetadata();
3209 }
3210 
3211 Value *InnerLoopVectorizer::emitTransformedIndex(
3212     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3213     const InductionDescriptor &ID) const {
3214 
3215   SCEVExpander Exp(*SE, DL, "induction");
3216   auto Step = ID.getStep();
3217   auto StartValue = ID.getStartValue();
3218   assert(Index->getType() == Step->getType() &&
3219          "Index type does not match StepValue type");
3220 
3221   // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
3225   // on InstCombine for future simplifications. Here we handle some trivial
3226   // cases only.
3227   auto CreateAdd = [&B](Value *X, Value *Y) {
3228     assert(X->getType() == Y->getType() && "Types don't match!");
3229     if (auto *CX = dyn_cast<ConstantInt>(X))
3230       if (CX->isZero())
3231         return Y;
3232     if (auto *CY = dyn_cast<ConstantInt>(Y))
3233       if (CY->isZero())
3234         return X;
3235     return B.CreateAdd(X, Y);
3236   };
3237 
3238   auto CreateMul = [&B](Value *X, Value *Y) {
3239     assert(X->getType() == Y->getType() && "Types don't match!");
3240     if (auto *CX = dyn_cast<ConstantInt>(X))
3241       if (CX->isOne())
3242         return Y;
3243     if (auto *CY = dyn_cast<ConstantInt>(Y))
3244       if (CY->isOne())
3245         return X;
3246     return B.CreateMul(X, Y);
3247   };
3248 
3249   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3250   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3251   // the DomTree is not kept up-to-date for additional blocks generated in the
3252   // vector loop. By using the header as insertion point, we guarantee that the
3253   // expanded instructions dominate all their uses.
3254   auto GetInsertPoint = [this, &B]() {
3255     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3256     if (InsertBB != LoopVectorBody &&
3257         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3258       return LoopVectorBody->getTerminator();
3259     return &*B.GetInsertPoint();
3260   };
3261   switch (ID.getKind()) {
3262   case InductionDescriptor::IK_IntInduction: {
3263     assert(Index->getType() == StartValue->getType() &&
3264            "Index type does not match StartValue type");
3265     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3266       return B.CreateSub(StartValue, Index);
3267     auto *Offset = CreateMul(
3268         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3269     return CreateAdd(StartValue, Offset);
3270   }
3271   case InductionDescriptor::IK_PtrInduction: {
3272     assert(isa<SCEVConstant>(Step) &&
3273            "Expected constant step for pointer induction");
3274     return B.CreateGEP(
3275         StartValue->getType()->getPointerElementType(), StartValue,
3276         CreateMul(Index,
3277                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3278   }
3279   case InductionDescriptor::IK_FpInduction: {
3280     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3281     auto InductionBinOp = ID.getInductionBinOp();
3282     assert(InductionBinOp &&
3283            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3284             InductionBinOp->getOpcode() == Instruction::FSub) &&
3285            "Original bin op should be defined for FP induction");
3286 
3287     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3288 
3289     // Floating point operations had to be 'fast' to enable the induction.
3290     FastMathFlags Flags;
3291     Flags.setFast();
3292 
3293     Value *MulExp = B.CreateFMul(StepValue, Index);
3294     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
3296       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3297 
3298     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3299                                "induction");
3300     if (isa<Instruction>(BOp))
3301       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3302 
3303     return BOp;
3304   }
3305   case InductionDescriptor::IK_NoInduction:
3306     return nullptr;
3307   }
3308   llvm_unreachable("invalid enum");
3309 }
3310 
3311 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3312   LoopScalarBody = OrigLoop->getHeader();
3313   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3314   LoopExitBlock = OrigLoop->getUniqueExitBlock();
3315   assert(LoopExitBlock && "Must have an exit block");
3316   assert(LoopVectorPreHeader && "Invalid loop structure");
3317 
3318   LoopMiddleBlock =
3319       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3320                  LI, nullptr, Twine(Prefix) + "middle.block");
3321   LoopScalarPreHeader =
3322       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3323                  nullptr, Twine(Prefix) + "scalar.ph");
3324 
3325   // Set up branch from middle block to the exit and scalar preheader blocks.
3326   // completeLoopSkeleton will update the condition to use an iteration check,
3327   // if required to decide whether to execute the remainder.
3328   BranchInst *BrInst =
3329       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
3330   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3331   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3332   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3333 
  // We intentionally don't let SplitBlock update LoopInfo since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
3337   LoopVectorBody =
3338       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3339                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3340 
3341   // Update dominator for loop exit.
3342   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3343 
3344   // Create and register the new vector loop.
3345   Loop *Lp = LI->AllocateLoop();
3346   Loop *ParentLoop = OrigLoop->getParentLoop();
3347 
3348   // Insert the new loop into the loop nest and register the new basic blocks
3349   // before calling any utilities such as SCEV that require valid LoopInfo.
3350   if (ParentLoop) {
3351     ParentLoop->addChildLoop(Lp);
3352   } else {
3353     LI->addTopLevelLoop(Lp);
3354   }
3355   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3356   return Lp;
3357 }
3358 
3359 void InnerLoopVectorizer::createInductionResumeValues(
3360     Loop *L, Value *VectorTripCount,
3361     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3362   assert(VectorTripCount && L && "Expected valid arguments");
3363   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3364           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3365          "Inconsistent information about additional bypass.");
3366   // We are going to resume the execution of the scalar loop.
3367   // Go over all of the induction variables that we found and fix the
3368   // PHIs that are left in the scalar version of the loop.
3369   // The starting values of PHI nodes depend on the counter of the last
3370   // iteration in the vectorized loop.
3371   // If we come from a bypass edge then we need to start from the original
3372   // start value.
3373   for (auto &InductionEntry : Legal->getInductionVars()) {
3374     PHINode *OrigPhi = InductionEntry.first;
3375     InductionDescriptor II = InductionEntry.second;
3376 
    // Create phi nodes to merge from the backedge-taken check block.
3378     PHINode *BCResumeVal =
3379         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3380                         LoopScalarPreHeader->getTerminator());
3381     // Copy original phi DL over to the new one.
3382     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3383     Value *&EndValue = IVEndValues[OrigPhi];
3384     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3385     if (OrigPhi == OldInduction) {
3386       // We know what the end value is.
3387       EndValue = VectorTripCount;
3388     } else {
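      // Compute the end value by applying this induction's transform to the
      // vector trip count, i.e. the value the IV would have after
      // VectorTripCount iterations of the original loop.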
3389       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3390       Type *StepType = II.getStep()->getType();
3391       Instruction::CastOps CastOp =
3392           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3393       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3394       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3395       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3396       EndValue->setName("ind.end");
3397 
3398       // Compute the end value for the additional bypass (if applicable).
3399       if (AdditionalBypass.first) {
3400         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3401         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3402                                          StepType, true);
3403         CRD =
3404             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3405         EndValueFromAdditionalBypass =
3406             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3407         EndValueFromAdditionalBypass->setName("ind.end");
3408       }
3409     }
3410     // The new PHI merges the original incoming value, in case of a bypass,
3411     // or the value at the end of the vectorized loop.
3412     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3413 
3414     // Fix the scalar body counter (PHI node).
3415     // The old induction's phi node in the scalar body needs the truncated
3416     // value.
3417     for (BasicBlock *BB : LoopBypassBlocks)
3418       BCResumeVal->addIncoming(II.getStartValue(), BB);
3419 
3420     if (AdditionalBypass.first)
3421       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3422                                             EndValueFromAdditionalBypass);
3423 
3424     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3425   }
3426 }
3427 
3428 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3429                                                       MDNode *OrigLoopID) {
3430   assert(L && "Expected valid loop.");
3431 
3432   // The trip counts should be cached by now.
3433   Value *Count = getOrCreateTripCount(L);
3434   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3435 
3436   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3437 
3438   // Add a check in the middle block to see if we have completed
3439   // all of the iterations in the first vector loop.
3440   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3441   // If tail is to be folded, we know we don't need to run the remainder.
3442   if (!Cost->foldTailByMasking()) {
3443     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3444                                         Count, VectorTripCount, "cmp.n",
3445                                         LoopMiddleBlock->getTerminator());
3446 
3447     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3448     // of the corresponding compare because they may have ended up with
3449     // different line numbers and we want to avoid awkward line stepping while
3450     // debugging. Eg. if the compare has got a line number inside the loop.
3451     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3452     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3453   }
3454 
3455   // Get ready to start creating new instructions into the vectorized body.
3456   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3457          "Inconsistent vector loop preheader");
3458   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3459 
3460   Optional<MDNode *> VectorizedLoopID =
3461       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3462                                       LLVMLoopVectorizeFollowupVectorized});
3463   if (VectorizedLoopID.hasValue()) {
3464     L->setLoopID(VectorizedLoopID.getValue());
3465 
3466     // Do not setAlreadyVectorized if loop attributes have been defined
3467     // explicitly.
3468     return LoopVectorPreHeader;
3469   }
3470 
3471   // Keep all loop hints from the original loop on the vector loop (we'll
3472   // replace the vectorizer-specific hints below).
3473   if (MDNode *LID = OrigLoop->getLoopID())
3474     L->setLoopID(LID);
3475 
3476   LoopVectorizeHints Hints(L, true, *ORE);
3477   Hints.setAlreadyVectorized();
3478 
3479 #ifdef EXPENSIVE_CHECKS
3480   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3481   LI->verify(*DT);
3482 #endif
3483 
3484   return LoopVectorPreHeader;
3485 }
3486 
3487 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3488   /*
3489    In this function we generate a new loop. The new loop will contain
3490    the vectorized instructions while the old loop will continue to run the
3491    scalar remainder.
3492 
3493        [ ] <-- loop iteration number check.
3494     /   |
3495    /    v
3496   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3497   |  /  |
3498   | /   v
3499   ||   [ ]     <-- vector pre header.
3500   |/    |
3501   |     v
3502   |    [  ] \
3503   |    [  ]_|   <-- vector loop.
3504   |     |
3505   |     v
3506   |   -[ ]   <--- middle-block.
3507   |  /  |
3508   | /   v
3509   -|- >[ ]     <--- new preheader.
3510    |    |
3511    |    v
3512    |   [ ] \
3513    |   [ ]_|   <-- old scalar loop to handle remainder.
3514     \   |
3515      \  v
3516       >[ ]     <-- exit block.
3517    ...
3518    */
3519 
3520   // Get the metadata of the original loop before it gets modified.
3521   MDNode *OrigLoopID = OrigLoop->getLoopID();
3522 
3523   // Create an empty vector loop, and prepare basic blocks for the runtime
3524   // checks.
3525   Loop *Lp = createVectorLoopSkeleton("");
3526 
3527   // Now, compare the new count to zero. If it is zero skip the vector loop and
3528   // jump to the scalar loop. This check also covers the case where the
3529   // backedge-taken count is uint##_max: adding one to it will overflow leading
3530   // to an incorrect trip count of zero. In this (rare) case we will also jump
3531   // to the scalar loop.
3532   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3533 
3534   // Generate the code to check any assumptions that we've made for SCEV
3535   // expressions.
3536   emitSCEVChecks(Lp, LoopScalarPreHeader);
3537 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
3541   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3542 
3543   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators that often have multiple pointer
3545   // induction variables. In the code below we also support a case where we
3546   // don't have a single induction variable.
3547   //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
3550   //   - is an integer
3551   //   - counts from zero, stepping by one
3552   //   - is the size of the widest induction variable type
3553   // then we create a new one.
3554   OldInduction = Legal->getPrimaryInduction();
3555   Type *IdxTy = Legal->getWidestInductionType();
3556   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3557   // The loop step is equal to the vectorization factor (num of SIMD elements)
3558   // times the unroll factor (num of SIMD instructions).
3559   Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3560   Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3561   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3562   Induction =
3563       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3564                               getDebugLocFromInstOrOperands(OldInduction));
3565 
3566   // Emit phis for the new starting index of the scalar loop.
3567   createInductionResumeValues(Lp, CountRoundDown);
3568 
3569   return completeLoopSkeleton(Lp, OrigLoopID);
3570 }
3571 
3572 // Fix up external users of the induction variable. At this point, we are
3573 // in LCSSA form, with all external PHIs that use the IV having one input value,
3574 // coming from the remainder loop. We need those PHIs to also have a correct
3575 // value for the IV when arriving directly from the middle block.
3576 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3577                                        const InductionDescriptor &II,
3578                                        Value *CountRoundDown, Value *EndValue,
3579                                        BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the value fed into the phi from the loop
  // latch) and those that use the penultimate value (the PHI itself).
  // We allow both, but, obviously, they have different values.
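  // For example, in illustrative shorthand scalar IR (names are made up):
  //
  //   loop:
  //     %iv      = phi [ 0, %ph ], [ %iv.next, %loop ]  ; penultimate value
  //     %iv.next = add %iv, 1                           ; last value
  //     br %cond, %loop, %exit
  //   exit:
  //     %use.phi  = phi [ %iv, %loop ]      ; needs EndValue - Step
  //     %use.next = phi [ %iv.next, %loop ] ; needs EndValue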
3584 
3585   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3586 
3587   DenseMap<Value *, Value *> MissingVals;
3588 
3589   // An external user of the last iteration's value should see the value that
3590   // the remainder loop uses to initialize its own IV.
3591   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3592   for (User *U : PostInc->users()) {
3593     Instruction *UI = cast<Instruction>(U);
3594     if (!OrigLoop->contains(UI)) {
3595       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3596       MissingVals[UI] = EndValue;
3597     }
3598   }
3599 
  // An external user of the penultimate value needs to see EndValue - Step.
3601   // The simplest way to get this is to recompute it from the constituent SCEVs,
3602   // that is Start + (Step * (CRD - 1)).
3603   for (User *U : OrigPhi->users()) {
3604     auto *UI = cast<Instruction>(U);
3605     if (!OrigLoop->contains(UI)) {
3606       const DataLayout &DL =
3607           OrigLoop->getHeader()->getModule()->getDataLayout();
3608       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3609 
3610       IRBuilder<> B(MiddleBlock->getTerminator());
3611       Value *CountMinusOne = B.CreateSub(
3612           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3613       Value *CMO =
3614           !II.getStep()->getType()->isIntegerTy()
3615               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3616                              II.getStep()->getType())
3617               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3618       CMO->setName("cast.cmo");
3619       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3620       Escape->setName("ind.escape");
3621       MissingVals[UI] = Escape;
3622     }
3623   }
3624 
3625   for (auto &I : MissingVals) {
3626     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3628     // that is %IV2 = phi [...], [ %IV1, %latch ]
3629     // In this case, if IV1 has an external use, we need to avoid adding both
3630     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3631     // don't already have an incoming value for the middle block.
3632     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3633       PHI->addIncoming(I.second, MiddleBlock);
3634   }
3635 }
3636 
3637 namespace {
3638 
3639 struct CSEDenseMapInfo {
3640   static bool canHandle(const Instruction *I) {
3641     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3642            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3643   }
3644 
3645   static inline Instruction *getEmptyKey() {
3646     return DenseMapInfo<Instruction *>::getEmptyKey();
3647   }
3648 
3649   static inline Instruction *getTombstoneKey() {
3650     return DenseMapInfo<Instruction *>::getTombstoneKey();
3651   }
3652 
3653   static unsigned getHashValue(const Instruction *I) {
3654     assert(canHandle(I) && "Unknown instruction!");
3655     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3656                                                            I->value_op_end()));
3657   }
3658 
3659   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3660     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3661         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3662       return LHS == RHS;
3663     return LHS->isIdenticalTo(RHS);
3664   }
3665 };
3666 
3667 } // end anonymous namespace
3668 
/// Perform common subexpression elimination (CSE) on induction variable
/// instructions.
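/// For example (illustrative), two identical "extractelement" instructions
/// created for different users of the same vector lane collapse into one.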
3670 static void cse(BasicBlock *BB) {
3671   // Perform simple cse.
3672   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3673   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3674     Instruction *In = &*I++;
3675 
3676     if (!CSEDenseMapInfo::canHandle(In))
3677       continue;
3678 
3679     // Check if we can replace this instruction with any of the
3680     // visited instructions.
3681     if (Instruction *V = CSEMap.lookup(In)) {
3682       In->replaceAllUsesWith(V);
3683       In->eraseFromParent();
3684       continue;
3685     }
3686 
3687     CSEMap[In] = In;
3688   }
3689 }
3690 
3691 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3692                                                        ElementCount VF,
3693                                                        bool &NeedToScalarize) {
3694   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3695   Function *F = CI->getCalledFunction();
3696   Type *ScalarRetTy = CI->getType();
3697   SmallVector<Type *, 4> Tys, ScalarTys;
3698   for (auto &ArgOp : CI->arg_operands())
3699     ScalarTys.push_back(ArgOp->getType());
3700 
3701   // Estimate cost of scalarized vector call. The source operands are assumed
3702   // to be vectors, so we need to extract individual elements from there,
3703   // execute VF scalar calls, and then gather the result into the vector return
3704   // value.
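  // In shorthand, the scalarized estimate computed below is roughly
  //   VF * ScalarCallCost + ScalarizationOverhead,
  // which is then compared against the cost of a native vector call, if the
  // target provides one.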
3705   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3706                                                  TTI::TCK_RecipThroughput);
3707   if (VF.isScalar())
3708     return ScalarCallCost;
3709 
3710   // Compute corresponding vector type for return value and arguments.
3711   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3712   for (Type *ScalarTy : ScalarTys)
3713     Tys.push_back(ToVectorTy(ScalarTy, VF));
3714 
3715   // Compute costs of unpacking argument values for the scalar calls and
3716   // packing the return values to a vector.
3717   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3718 
3719   unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3720 
3721   // If we can't emit a vector call for this function, then the currently found
3722   // cost is the cost we need to return.
3723   NeedToScalarize = true;
3724   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3725   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3726 
3727   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3728     return Cost;
3729 
3730   // If the corresponding vector cost is cheaper, return its cost.
3731   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3732                                                  TTI::TCK_RecipThroughput);
3733   if (VectorCallCost < Cost) {
3734     NeedToScalarize = false;
3735     return VectorCallCost;
3736   }
3737   return Cost;
3738 }
3739 
3740 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3741                                                             ElementCount VF) {
3742   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3743   assert(ID && "Expected intrinsic call!");
3744 
3745   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3746   return TTI.getIntrinsicInstrCost(CostAttrs,
3747                                    TargetTransformInfo::TCK_RecipThroughput);
3748 }
3749 
3750 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3751   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3752   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3753   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3754 }
3755 
3756 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3757   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3758   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3759   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3760 }
3761 
3762 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3763   // For every instruction `I` in MinBWs, truncate the operands, create a
3764   // truncated version of `I` and reextend its result. InstCombine runs
3765   // later and will remove any ext/trunc pairs.
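  // For example (illustrative), an i32 add known to need only 8 bits:
  //
  //   %a = add <4 x i32> %x, %y
  //
  // becomes
  //
  //   %x.tr = trunc <4 x i32> %x to <4 x i8>
  //   %y.tr = trunc <4 x i32> %y to <4 x i8>
  //   %a.tr = add <4 x i8> %x.tr, %y.tr
  //   %a.ex = zext <4 x i8> %a.tr to <4 x i32>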
3766   SmallPtrSet<Value *, 4> Erased;
3767   for (const auto &KV : Cost->getMinimalBitwidths()) {
3768     // If the value wasn't vectorized, we must maintain the original scalar
3769     // type. The absence of the value from VectorLoopValueMap indicates that it
3770     // wasn't vectorized.
3771     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3772       continue;
3773     for (unsigned Part = 0; Part < UF; ++Part) {
3774       Value *I = getOrCreateVectorValue(KV.first, Part);
3775       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3776         continue;
3777       Type *OriginalTy = I->getType();
3778       Type *ScalarTruncatedTy =
3779           IntegerType::get(OriginalTy->getContext(), KV.second);
3780       auto *TruncatedTy = FixedVectorType::get(
3781           ScalarTruncatedTy,
3782           cast<FixedVectorType>(OriginalTy)->getNumElements());
3783       if (TruncatedTy == OriginalTy)
3784         continue;
3785 
3786       IRBuilder<> B(cast<Instruction>(I));
3787       auto ShrinkOperand = [&](Value *V) -> Value * {
3788         if (auto *ZI = dyn_cast<ZExtInst>(V))
3789           if (ZI->getSrcTy() == TruncatedTy)
3790             return ZI->getOperand(0);
3791         return B.CreateZExtOrTrunc(V, TruncatedTy);
3792       };
3793 
3794       // The actual instruction modification depends on the instruction type,
3795       // unfortunately.
3796       Value *NewI = nullptr;
3797       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3798         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3799                              ShrinkOperand(BO->getOperand(1)));
3800 
3801         // Any wrapping introduced by shrinking this operation shouldn't be
3802         // considered undefined behavior. So, we can't unconditionally copy
3803         // arithmetic wrapping flags to NewI.
3804         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3805       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3806         NewI =
3807             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3808                          ShrinkOperand(CI->getOperand(1)));
3809       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3810         NewI = B.CreateSelect(SI->getCondition(),
3811                               ShrinkOperand(SI->getTrueValue()),
3812                               ShrinkOperand(SI->getFalseValue()));
3813       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3814         switch (CI->getOpcode()) {
3815         default:
3816           llvm_unreachable("Unhandled cast!");
3817         case Instruction::Trunc:
3818           NewI = ShrinkOperand(CI->getOperand(0));
3819           break;
3820         case Instruction::SExt:
3821           NewI = B.CreateSExtOrTrunc(
3822               CI->getOperand(0),
3823               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3824           break;
3825         case Instruction::ZExt:
3826           NewI = B.CreateZExtOrTrunc(
3827               CI->getOperand(0),
3828               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3829           break;
3830         }
3831       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3832         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3833                              ->getNumElements();
3834         auto *O0 = B.CreateZExtOrTrunc(
3835             SI->getOperand(0),
3836             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3837         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3838                              ->getNumElements();
3839         auto *O1 = B.CreateZExtOrTrunc(
3840             SI->getOperand(1),
3841             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3842 
3843         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3844       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3845         // Don't do anything with the operands, just extend the result.
3846         continue;
3847       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3848         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3849                             ->getNumElements();
3850         auto *O0 = B.CreateZExtOrTrunc(
3851             IE->getOperand(0),
3852             FixedVectorType::get(ScalarTruncatedTy, Elements));
3853         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3854         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3855       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3856         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3857                             ->getNumElements();
3858         auto *O0 = B.CreateZExtOrTrunc(
3859             EE->getOperand(0),
3860             FixedVectorType::get(ScalarTruncatedTy, Elements));
3861         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3862       } else {
3863         // If we don't know what to do, be conservative and don't do anything.
3864         continue;
3865       }
3866 
3867       // Lastly, extend the result.
3868       NewI->takeName(cast<Instruction>(I));
3869       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3870       I->replaceAllUsesWith(Res);
3871       cast<Instruction>(I)->eraseFromParent();
3872       Erased.insert(I);
3873       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3874     }
3875   }
3876 
  // We'll have created a bunch of ZExts that are now dead. Clean them up.
3878   for (const auto &KV : Cost->getMinimalBitwidths()) {
3879     // If the value wasn't vectorized, we must maintain the original scalar
3880     // type. The absence of the value from VectorLoopValueMap indicates that it
3881     // wasn't vectorized.
3882     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3883       continue;
3884     for (unsigned Part = 0; Part < UF; ++Part) {
3885       Value *I = getOrCreateVectorValue(KV.first, Part);
3886       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3887       if (Inst && Inst->use_empty()) {
3888         Value *NewI = Inst->getOperand(0);
3889         Inst->eraseFromParent();
3890         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3891       }
3892     }
3893   }
3894 }
3895 
3896 void InnerLoopVectorizer::fixVectorizedLoop() {
3897   // Insert truncates and extends for any truncated instructions as hints to
3898   // InstCombine.
3899   if (VF.isVector())
3900     truncateToMinimalBitwidths();
3901 
3902   // Fix widened non-induction PHIs by setting up the PHI operands.
3903   if (OrigPHIsToFix.size()) {
3904     assert(EnableVPlanNativePath &&
3905            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3906     fixNonInductionPHIs();
3907   }
3908 
3909   // At this point every instruction in the original loop is widened to a
3910   // vector form. Now we need to fix the recurrences in the loop. These PHI
3911   // nodes are currently empty because we did not want to introduce cycles.
3912   // This is the second stage of vectorizing recurrences.
3913   fixCrossIterationPHIs();
3914 
3915   // Forget the original basic block.
3916   PSE.getSE()->forgetLoop(OrigLoop);
3917 
3918   // Fix-up external users of the induction variables.
3919   for (auto &Entry : Legal->getInductionVars())
3920     fixupIVUsers(Entry.first, Entry.second,
3921                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3922                  IVEndValues[Entry.first], LoopMiddleBlock);
3923 
3924   fixLCSSAPHIs();
3925   for (Instruction *PI : PredicatedInstructions)
3926     sinkScalarOperands(&*PI);
3927 
3928   // Remove redundant induction instructions.
3929   cse(LoopVectorBody);
3930 
  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
3934   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less precise result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that any possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
3940   //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
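  // For example (illustrative), if the original loop's profile reports ~100
  // iterations and VF * UF == 8, the vector loop is assigned ~12 iterations
  // and the scalar remainder ~4 (100 modulo 8).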
3944   setProfileInfoAfterUnrolling(
3945       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3946       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3947 }
3948 
3949 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3950   // In order to support recurrences we need to be able to vectorize Phi nodes.
3951   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3952   // stage #2: We now need to fix the recurrences by adding incoming edges to
3953   // the currently empty PHI nodes. At this point every instruction in the
3954   // original loop is widened to a vector form so we can use them to construct
3955   // the incoming edges.
3956   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3957     // Handle first-order recurrences and reductions that need to be fixed.
3958     if (Legal->isFirstOrderRecurrence(&Phi))
3959       fixFirstOrderRecurrence(&Phi);
3960     else if (Legal->isReductionVariable(&Phi))
3961       fixReduction(&Phi);
3962   }
3963 }
3964 
3965 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3966   // This is the second phase of vectorizing first-order recurrences. An
3967   // overview of the transformation is described below. Suppose we have the
3968   // following loop.
3969   //
3970   //   for (int i = 0; i < n; ++i)
3971   //     b[i] = a[i] - a[i - 1];
3972   //
3973   // There is a first-order recurrence on "a". For this loop, the shorthand
3974   // scalar IR looks like:
3975   //
3976   //   scalar.ph:
3977   //     s_init = a[-1]
3978   //     br scalar.body
3979   //
3980   //   scalar.body:
3981   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3982   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3983   //     s2 = a[i]
3984   //     b[i] = s2 - s1
3985   //     br cond, scalar.body, ...
3986   //
  // In this example, s1 is a recurrence because its value depends on the
3988   // previous iteration. In the first phase of vectorization, we created a
3989   // temporary value for s1. We now complete the vectorization and produce the
3990   // shorthand vector IR shown below (for VF = 4, UF = 1).
3991   //
3992   //   vector.ph:
3993   //     v_init = vector(..., ..., ..., a[-1])
3994   //     br vector.body
3995   //
3996   //   vector.body
3997   //     i = phi [0, vector.ph], [i+4, vector.body]
3998   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3999   //     v2 = a[i, i+1, i+2, i+3];
4000   //     v3 = vector(v1(3), v2(0, 1, 2))
4001   //     b[i, i+1, i+2, i+3] = v2 - v3
4002   //     br cond, vector.body, middle.block
4003   //
4004   //   middle.block:
4005   //     x = v2(3)
4006   //     br scalar.ph
4007   //
4008   //   scalar.ph:
4009   //     s_init = phi [x, middle.block], [a[-1], otherwise]
4010   //     br scalar.body
4011   //
  // After the vector loop finishes executing, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
4014 
4015   // Get the original loop preheader and single loop latch.
4016   auto *Preheader = OrigLoop->getLoopPreheader();
4017   auto *Latch = OrigLoop->getLoopLatch();
4018 
4019   // Get the initial and previous values of the scalar recurrence.
4020   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4021   auto *Previous = Phi->getIncomingValueForBlock(Latch);
4022 
4023   // Create a vector from the initial value.
4024   auto *VectorInit = ScalarInit;
4025   if (VF.isVector()) {
4026     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4027     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4028     VectorInit = Builder.CreateInsertElement(
4029         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4030         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
4031   }
4032 
4033   // We constructed a temporary phi node in the first phase of vectorization.
4034   // This phi node will eventually be deleted.
4035   Builder.SetInsertPoint(
4036       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
4037 
4038   // Create a phi node for the new recurrence. The current value will either be
4039   // the initial value inserted into a vector or loop-varying vector value.
4040   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4041   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4042 
4043   // Get the vectorized previous value of the last part UF - 1. It appears last
4044   // among all unrolled iterations, due to the order of their construction.
4045   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
4046 
4047   // Find and set the insertion point after the previous value if it is an
4048   // instruction.
4049   BasicBlock::iterator InsertPt;
4050   // Note that the previous value may have been constant-folded so it is not
4051   // guaranteed to be an instruction in the vector loop.
4052   // FIXME: Loop invariant values do not form recurrences. We should deal with
4053   //        them earlier.
4054   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4055     InsertPt = LoopVectorBody->getFirstInsertionPt();
4056   else {
4057     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4058     if (isa<PHINode>(PreviousLastPart))
4059       // If the previous value is a phi node, we should insert after all the phi
4060       // nodes in the block containing the PHI to avoid breaking basic block
4061       // verification. Note that the basic block may be different to
4062       // LoopVectorBody, in case we predicate the loop.
4063       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4064     else
4065       InsertPt = ++PreviousInst->getIterator();
4066   }
4067   Builder.SetInsertPoint(&*InsertPt);
4068 
4069   // We will construct a vector for the recurrence by combining the values for
4070   // the current and previous iterations. This is the required shuffle mask.
4071   assert(!VF.isScalable());
4072   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
4073   ShuffleMask[0] = VF.getKnownMinValue() - 1;
4074   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
4075     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
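  // For example (illustrative), with VF = 4 this mask is <3, 4, 5, 6>: the
  // last lane of `Incoming` followed by the first three lanes of the current
  // part, matching v3 = vector(v1(3), v2(0, 1, 2)) in the example above.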
4076 
4077   // The vector from which to take the initial value for the current iteration
4078   // (actual or unrolled). Initially, this is the vector phi node.
4079   Value *Incoming = VecPhi;
4080 
4081   // Shuffle the current and previous vector and update the vector parts.
4082   for (unsigned Part = 0; Part < UF; ++Part) {
4083     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
4084     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
4085     auto *Shuffle =
4086         VF.isVector()
4087             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
4088             : Incoming;
4089     PhiPart->replaceAllUsesWith(Shuffle);
4090     cast<Instruction>(PhiPart)->eraseFromParent();
4091     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
4092     Incoming = PreviousPart;
4093   }
4094 
4095   // Fix the latch value of the new recurrence in the vector loop.
4096   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4097 
4098   // Extract the last vector element in the middle block. This will be the
4099   // initial value for the recurrence when jumping to the scalar loop.
4100   auto *ExtractForScalar = Incoming;
4101   if (VF.isVector()) {
4102     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4103     ExtractForScalar = Builder.CreateExtractElement(
4104         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
4105         "vector.recur.extract");
4106   }
4107   // Extract the second last element in the middle block if the
4108   // Phi is used outside the loop. We need to extract the phi itself
4109   // and not the last element (the phi update in the current iteration). This
4110   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4111   // when the scalar loop is not run at all.
4112   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4113   if (VF.isVector())
4114     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4115         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4116         "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
  // `Incoming`. This is analogous to the vectorized case above: extracting the
  // second last element when VF > 1.
4121   else if (UF > 1)
4122     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
4123 
4124   // Fix the initial value of the original recurrence in the scalar loop.
4125   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4126   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4127   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4128     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4129     Start->addIncoming(Incoming, BB);
4130   }
4131 
4132   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4133   Phi->setName("scalar.recur");
4134 
4135   // Finally, fix users of the recurrence outside the loop. The users will need
4136   // either the last value of the scalar recurrence or the last value of the
4137   // vector recurrence we extracted in the middle block. Since the loop is in
4138   // LCSSA form, we just need to find all the phi nodes for the original scalar
4139   // recurrence in the exit block, and then add an edge for the middle block.
4140   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4141     if (LCSSAPhi.getIncomingValue(0) == Phi) {
4142       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4143     }
4144   }
4145 }
4146 
4147 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
4148   Constant *Zero = Builder.getInt32(0);
4149 
  // Get its reduction variable descriptor.
4151   assert(Legal->isReductionVariable(Phi) &&
4152          "Unable to find the reduction variable");
4153   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4154 
4155   RecurKind RK = RdxDesc.getRecurrenceKind();
4156   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4157   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4158   setDebugLocFromInst(Builder, ReductionStartValue);
4159   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4160 
4161   // We need to generate a reduction vector from the incoming scalar.
4162   // To do so, we need to generate the 'identity' vector and override
4163   // one of the elements with the incoming scalar reduction. We need
4164   // to do it in the vector-loop preheader.
4165   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4166 
4167   // This is the vector-clone of the value that leaves the loop.
4168   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
4169 
  // Find the reduction identity value: zero for addition, or and xor;
  // one for multiplication; -1 for and.
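  // For example (illustrative, VF = 4): an integer add reduction with start
  // value %s uses identity 0, giving Identity = <0, 0, 0, 0> and
  // VectorStart = <%s, 0, 0, 0>, whereas an 'and' reduction would use -1.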
4172   Value *Identity;
4173   Value *VectorStart;
4174   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
    // MinMax reductions have the start value as their identity.
4176     if (VF.isScalar() || IsInLoopReductionPhi) {
4177       VectorStart = Identity = ReductionStartValue;
4178     } else {
4179       VectorStart = Identity =
4180         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
4181     }
4182   } else {
4183     // Handle other reduction kinds:
4184     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
4185         RK, VecTy->getScalarType());
4186     if (VF.isScalar() || IsInLoopReductionPhi) {
4187       Identity = Iden;
      // In the scalar (or in-loop) case, the start value is simply the
      // incoming scalar reduction.
4190       VectorStart = ReductionStartValue;
4191     } else {
4192       Identity = ConstantVector::getSplat(VF, Iden);
4193 
4194       // This vector is the Identity vector where the first element is the
4195       // incoming scalar reduction.
4196       VectorStart =
4197         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
4198     }
4199   }
4200 
4201   // Wrap flags are in general invalid after vectorization, clear them.
4202   clearReductionWrapFlags(RdxDesc);
4203 
4204   // Fix the vector-loop phi.
4205 
4206   // Reductions do not have to start at zero. They can start with
4207   // any loop invariant values.
4208   BasicBlock *Latch = OrigLoop->getLoopLatch();
4209   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4210 
4211   for (unsigned Part = 0; Part < UF; ++Part) {
4212     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
4213     Value *Val = getOrCreateVectorValue(LoopVal, Part);
4214     // Make sure to add the reduction start value only to the
4215     // first unroll part.
4216     Value *StartVal = (Part == 0) ? VectorStart : Identity;
4217     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
4218     cast<PHINode>(VecRdxPhi)
4219       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4220   }
4221 
4222   // Before each round, move the insertion point right between
4223   // the PHIs and the values we are going to write.
4224   // This allows us to write both PHINodes and the extractelement
4225   // instructions.
4226   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4227 
4228   setDebugLocFromInst(Builder, LoopExitInst);
4229 
  // If the tail is folded by masking, the vector value leaving the loop should
  // be a Select choosing between the vectorized LoopExitInst and the
  // vectorized Phi, instead of the LoopExitInst alone. For an inloop
  // reduction, the reduction will already be predicated and does not need to
  // be handled here.
4234   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4235     for (unsigned Part = 0; Part < UF; ++Part) {
4236       Value *VecLoopExitInst =
4237           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4238       Value *Sel = nullptr;
4239       for (User *U : VecLoopExitInst->users()) {
4240         if (isa<SelectInst>(U)) {
4241           assert(!Sel && "Reduction exit feeding two selects");
4242           Sel = U;
4243         } else
4244           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4245       }
4246       assert(Sel && "Reduction exit feeds no select");
4247       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4248 
4249       // If the target can create a predicated operator for the reduction at no
4250       // extra cost in the loop (for example a predicated vadd), it can be
4251       // cheaper for the select to remain in the loop than be sunk out of it,
4252       // and so use the select value for the phi instead of the old
4253       // LoopExitValue.
4254       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4255       if (PreferPredicatedReductionSelect ||
4256           TTI->preferPredicatedReductionSelect(
4257               RdxDesc.getRecurrenceBinOp(), Phi->getType(),
4258               TargetTransformInfo::ReductionFlags())) {
4259         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4260         VecRdxPhi->setIncomingValueForBlock(
4261             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4262       }
4263     }
4264   }
4265 
4266   // If the vector reduction can be performed in a smaller type, we truncate
4267   // then extend the loop exit value to enable InstCombine to evaluate the
4268   // entire expression in the smaller type.
4269   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4270     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4271     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4272     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4273     Builder.SetInsertPoint(
4274         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4275     VectorParts RdxParts(UF);
4276     for (unsigned Part = 0; Part < UF; ++Part) {
4277       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4278       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4279       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4280                                         : Builder.CreateZExt(Trunc, VecTy);
4281       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4282            UI != RdxParts[Part]->user_end();)
4283         if (*UI != Trunc) {
4284           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4285           RdxParts[Part] = Extnd;
4286         } else {
4287           ++UI;
4288         }
4289     }
4290     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4291     for (unsigned Part = 0; Part < UF; ++Part) {
4292       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4293       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4294     }
4295   }
4296 
4297   // Reduce all of the unrolled parts into a single vector.
4298   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4299   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4300 
4301   // The middle block terminator has already been assigned a DebugLoc here (the
4302   // OrigLoop's single latch terminator). We want the whole middle block to
4303   // appear to execute on this line because: (a) it is all compiler generated,
4304   // (b) these instructions are always executed after evaluating the latch
4305   // conditional branch, and (c) other passes may add new predecessors which
4306   // terminate on this line. This is the easiest way to ensure we don't
4307   // accidentally cause an extra step back into the loop while debugging.
4308   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4309   for (unsigned Part = 1; Part < UF; ++Part) {
4310     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4311     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4312       // Floating point operations had to be 'fast' to enable the reduction.
4313       ReducedPartRdx = addFastMathFlag(
4314           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4315                               ReducedPartRdx, "bin.rdx"),
4316           RdxDesc.getFastMathFlags());
4317     else
4318       ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4319   }
4320 
4321   // Create the reduction after the loop. Note that inloop reductions create the
4322   // target reduction in the loop using a Reduction recipe.
4323   if (VF.isVector() && !IsInLoopReductionPhi) {
4324     ReducedPartRdx =
4325         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4326     // If the reduction can be performed in a smaller type, we need to extend
4327     // the reduction to the wider type before we branch to the original loop.
4328     if (Phi->getType() != RdxDesc.getRecurrenceType())
4329       ReducedPartRdx =
4330         RdxDesc.isSigned()
4331         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4332         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4333   }
4334 
4335   // Create a phi node that merges control-flow from the backedge-taken check
4336   // block and the middle block.
4337   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4338                                         LoopScalarPreHeader->getTerminator());
4339   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4340     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4341   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4342 
4343   // Now, we need to fix the users of the reduction variable
4344   // inside and outside of the scalar remainder loop.
4345   // We know that the loop is in LCSSA form. We need to update the
4346   // PHI nodes in the exit blocks.
4347   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4348     // All PHINodes need to have a single entry edge, or two if
4349     // we already fixed them.
4350     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4351 
4352     // We found a reduction value exit-PHI. Update it with the
4353     // incoming bypass edge.
4354     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4355       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4356   } // end of the LCSSA phi scan.
4357 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
4360   int IncomingEdgeBlockIdx =
4361     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4362   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4363   // Pick the other block.
4364   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4365   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4366   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4367 }
4368 
4369 void InnerLoopVectorizer::clearReductionWrapFlags(
4370     RecurrenceDescriptor &RdxDesc) {
4371   RecurKind RK = RdxDesc.getRecurrenceKind();
4372   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4373     return;
4374 
4375   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4376   assert(LoopExitInstr && "null loop exit instruction");
4377   SmallVector<Instruction *, 8> Worklist;
4378   SmallPtrSet<Instruction *, 8> Visited;
4379   Worklist.push_back(LoopExitInstr);
4380   Visited.insert(LoopExitInstr);
4381 
4382   while (!Worklist.empty()) {
4383     Instruction *Cur = Worklist.pop_back_val();
4384     if (isa<OverflowingBinaryOperator>(Cur))
4385       for (unsigned Part = 0; Part < UF; ++Part) {
4386         Value *V = getOrCreateVectorValue(Cur, Part);
4387         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4388       }
4389 
4390     for (User *U : Cur->users()) {
4391       Instruction *UI = cast<Instruction>(U);
4392       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4393           Visited.insert(UI).second)
4394         Worklist.push_back(UI);
4395     }
4396   }
4397 }
4398 
4399 void InnerLoopVectorizer::fixLCSSAPHIs() {
4400   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4401     if (LCSSAPhi.getNumIncomingValues() == 1) {
4402       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4403       // Non-instruction incoming values will have only one value.
4404       unsigned LastLane = 0;
4405       if (isa<Instruction>(IncomingValue))
4406         LastLane = Cost->isUniformAfterVectorization(
4407                        cast<Instruction>(IncomingValue), VF)
4408                        ? 0
4409                        : VF.getKnownMinValue() - 1;
4410       assert((!VF.isScalable() || LastLane == 0) &&
4411              "scalable vectors dont support non-uniform scalars yet");
4412       // Can be a loop invariant incoming value or the last scalar value to be
4413       // extracted from the vectorized loop.
4414       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4415       Value *lastIncomingValue =
4416           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4417       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4418     }
4419   }
4420 }
4421 
4422 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4423   // The basic block and loop containing the predicated instruction.
4424   auto *PredBB = PredInst->getParent();
4425   auto *VectorLoop = LI->getLoopFor(PredBB);
4426 
4427   // Initialize a worklist with the operands of the predicated instruction.
4428   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4429 
4430   // Holds instructions that we need to analyze again. An instruction may be
4431   // reanalyzed if we don't yet know if we can sink it or not.
4432   SmallVector<Instruction *, 8> InstsToReanalyze;
4433 
4434   // Returns true if a given use occurs in the predicated block. Phi nodes use
4435   // their operands in their corresponding predecessor blocks.
4436   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4437     auto *I = cast<Instruction>(U.getUser());
4438     BasicBlock *BB = I->getParent();
4439     if (auto *Phi = dyn_cast<PHINode>(I))
4440       BB = Phi->getIncomingBlock(
4441           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4442     return BB == PredBB;
4443   };
4444 
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a full
  // pass through the worklist doesn't sink a single instruction.
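  // For example (illustrative), if a predicated store's address is computed by
  // a scalar GEP whose only user is that store, the GEP is moved into the
  // predicated block and its operands are then reconsidered on the next pass.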
4449   bool Changed;
4450   do {
4451     // Add the instructions that need to be reanalyzed to the worklist, and
4452     // reset the changed indicator.
4453     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4454     InstsToReanalyze.clear();
4455     Changed = false;
4456 
4457     while (!Worklist.empty()) {
4458       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4459 
4460       // We can't sink an instruction if it is a phi node, is already in the
4461       // predicated block, is not in the loop, or may have side effects.
4462       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4463           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4464         continue;
4465 
4466       // It's legal to sink the instruction if all its uses occur in the
4467       // predicated block. Otherwise, there's nothing to do yet, and we may
4468       // need to reanalyze the instruction.
4469       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4470         InstsToReanalyze.push_back(I);
4471         continue;
4472       }
4473 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4476       I->moveBefore(&*PredBB->getFirstInsertionPt());
4477       Worklist.insert(I->op_begin(), I->op_end());
4478 
4479       // The sinking may have enabled other instructions to be sunk, so we will
4480       // need to iterate.
4481       Changed = true;
4482     }
4483   } while (Changed);
4484 }
4485 
4486 void InnerLoopVectorizer::fixNonInductionPHIs() {
4487   for (PHINode *OrigPhi : OrigPHIsToFix) {
4488     PHINode *NewPhi =
4489         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4490     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4491 
4492     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4493         predecessors(OrigPhi->getParent()));
4494     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4495         predecessors(NewPhi->getParent()));
4496     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4497            "Scalar and Vector BB should have the same number of predecessors");
4498 
4499     // The insertion point in Builder may be invalidated by the time we get
4500     // here. Force the Builder insertion point to something valid so that we do
4501     // not run into issues during insertion point restore in
4502     // getOrCreateVectorValue calls below.
4503     Builder.SetInsertPoint(NewPhi);
4504 
4505     // The predecessor order is preserved and we can rely on mapping between
4506     // scalar and vector block predecessors.
4507     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4508       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4509 
4510       // When looking up the new scalar/vector values to fix up, use incoming
4511       // values from original phi.
4512       Value *ScIncV =
4513           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4514 
4515       // Scalar incoming value may need a broadcast
4516       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4517       NewPhi->addIncoming(NewIncV, NewPredBB);
4518     }
4519   }
4520 }
4521 
4522 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4523                                    VPUser &Operands, unsigned UF,
4524                                    ElementCount VF, bool IsPtrLoopInvariant,
4525                                    SmallBitVector &IsIndexLoopInvariant,
4526                                    VPTransformState &State) {
4527   // Construct a vector GEP by widening the operands of the scalar GEP as
4528   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4529   // results in a vector of pointers when at least one operand of the GEP
4530   // is vector-typed. Thus, to keep the representation compact, we only use
4531   // vector-typed operands for loop-varying values.
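  // For example (illustrative), a GEP with a loop-invariant base pointer and a
  // single loop-varying index is widened (when VF > 1) by keeping the scalar
  // base and feeding it a vector of indices, yielding one vector of VF
  // pointers per unroll part.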
4532 
4533   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4534     // If we are vectorizing, but the GEP has only loop-invariant operands,
4535     // the GEP we build (by only using vector-typed operands for
4536     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4537     // produce a vector of pointers, we need to either arbitrarily pick an
4538     // operand to broadcast, or broadcast a clone of the original GEP.
4539     // Here, we broadcast a clone of the original.
4540     //
4541     // TODO: If at some point we decide to scalarize instructions having
4542     //       loop-invariant operands, this special case will no longer be
4543     //       required. We would add the scalarization decision to
4544     //       collectLoopScalars() and teach getVectorValue() to broadcast
4545     //       the lane-zero scalar value.
4546     auto *Clone = Builder.Insert(GEP->clone());
4547     for (unsigned Part = 0; Part < UF; ++Part) {
4548       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4549       State.set(VPDef, GEP, EntryPart, Part);
4550       addMetadata(EntryPart, GEP);
4551     }
4552   } else {
4553     // If the GEP has at least one loop-varying operand, we are sure to
4554     // produce a vector of pointers. But if we are only unrolling, we want
4555     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4556     // produce with the code below will be scalar (if VF == 1) or vector
4557     // (otherwise). Note that for the unroll-only case, we still maintain
4558     // values in the vector mapping with initVector, as we do for other
4559     // instructions.
4560     for (unsigned Part = 0; Part < UF; ++Part) {
4561       // The pointer operand of the new GEP. If it's loop-invariant, we
4562       // won't broadcast it.
4563       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4564                                      : State.get(Operands.getOperand(0), Part);
4565 
4566       // Collect all the indices for the new GEP. If any index is
4567       // loop-invariant, we won't broadcast it.
4568       SmallVector<Value *, 4> Indices;
4569       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4570         VPValue *Operand = Operands.getOperand(I);
4571         if (IsIndexLoopInvariant[I - 1])
4572           Indices.push_back(State.get(Operand, {0, 0}));
4573         else
4574           Indices.push_back(State.get(Operand, Part));
4575       }
4576 
4577       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4578       // but it should be a vector, otherwise.
4579       auto *NewGEP =
4580           GEP->isInBounds()
4581               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4582                                           Indices)
4583               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4584       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4585              "NewGEP is not a pointer vector");
4586       State.set(VPDef, GEP, NewGEP, Part);
4587       addMetadata(NewGEP, GEP);
4588     }
4589   }
4590 }
4591 
4592 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4593                                               ElementCount VF) {
4594   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4595   PHINode *P = cast<PHINode>(PN);
4596   if (EnableVPlanNativePath) {
4597     // Currently we enter here in the VPlan-native path for non-induction
4598     // PHIs where all control flow is uniform. We simply widen these PHIs.
4599     // Create a vector phi with no operands - the vector phi operands will be
4600     // set at the end of vector code generation.
4601     Type *VecTy =
4602         (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4603     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4604     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4605     OrigPHIsToFix.push_back(P);
4606 
4607     return;
4608   }
4609 
4610   assert(PN->getParent() == OrigLoop->getHeader() &&
4611          "Non-header phis should have been handled elsewhere");
4612 
4613   // In order to support recurrences we need to be able to vectorize Phi nodes.
4614   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4615   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4616   // this value when we vectorize all of the instructions that use the PHI.
4617   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4618     for (unsigned Part = 0; Part < UF; ++Part) {
4619       // This is phase one of vectorizing PHIs.
4620       bool ScalarPHI =
4621           (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4622       Type *VecTy =
4623           ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4624       Value *EntryPart = PHINode::Create(
4625           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4626       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4627     }
4628     return;
4629   }
4630 
4631   setDebugLocFromInst(Builder, P);
4632 
4633   // This PHINode must be an induction variable.
4634   // Make sure that we know about it.
4635   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4636 
4637   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4638   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4639 
4640   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4641   // which can be found from the original scalar operations.
4642   switch (II.getKind()) {
4643   case InductionDescriptor::IK_NoInduction:
4644     llvm_unreachable("Unknown induction");
4645   case InductionDescriptor::IK_IntInduction:
4646   case InductionDescriptor::IK_FpInduction:
4647     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4648   case InductionDescriptor::IK_PtrInduction: {
4649     // Handle the pointer induction variable case.
4650     assert(P->getType()->isPointerTy() && "Unexpected type.");
4651 
4652     if (Cost->isScalarAfterVectorization(P, VF)) {
4653       // This is the normalized GEP that starts counting at zero.
4654       Value *PtrInd =
4655           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4656       // Determine the number of scalars we need to generate for each unroll
4657       // iteration. If the instruction is uniform, we only need to generate the
4658       // first lane. Otherwise, we generate all VF values.
4659       unsigned Lanes =
4660           Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4661       for (unsigned Part = 0; Part < UF; ++Part) {
4662         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4663           Constant *Idx = ConstantInt::get(PtrInd->getType(),
4664                                            Lane + Part * VF.getKnownMinValue());
4665           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4666           Value *SclrGep =
4667               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4668           SclrGep->setName("next.gep");
4669           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4670         }
4671       }
4672       return;
4673     }
4674     assert(isa<SCEVConstant>(II.getStep()) &&
4675            "Induction step not a SCEV constant!");
4676     Type *PhiType = II.getStep()->getType();
4677 
4678     // Build a pointer phi
4679     Value *ScalarStartValue = II.getStartValue();
4680     Type *ScStValueType = ScalarStartValue->getType();
4681     PHINode *NewPointerPhi =
4682         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4683     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4684 
    // A pointer induction, performed by using a GEP.
4686     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4687     Instruction *InductionLoc = LoopLatch->getTerminator();
4688     const SCEV *ScalarStep = II.getStep();
4689     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4690     Value *ScalarStepValue =
4691         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4692     Value *InductionGEP = GetElementPtrInst::Create(
4693         ScStValueType->getPointerElementType(), NewPointerPhi,
4694         Builder.CreateMul(
4695             ScalarStepValue,
4696             ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4697         "ptr.ind", InductionLoc);
4698     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4699 
4700     // Create UF many actual address geps that use the pointer
4701     // phi as base and a vectorized version of the step value
4702     // (<step*0, ..., step*N>) as offset.
4703     for (unsigned Part = 0; Part < UF; ++Part) {
4704       SmallVector<Constant *, 8> Indices;
4705       // Create a vector of consecutive numbers from zero to VF.
4706       for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4707         Indices.push_back(
4708             ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4709       Constant *StartOffset = ConstantVector::get(Indices);
4710 
4711       Value *GEP = Builder.CreateGEP(
4712           ScStValueType->getPointerElementType(), NewPointerPhi,
4713           Builder.CreateMul(
4714               StartOffset,
4715               Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4716               "vector.gep"));
4717       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4718     }
4719   }
4720   }
4721 }
4722 
4723 /// A helper function for checking whether an integer division-related
4724 /// instruction may divide by zero (in which case it must be predicated if
4725 /// executed conditionally in the scalar code).
4726 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so without predication.
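/// For example (illustrative IR), 'urem i32 %x, %n' with a non-constant %n
/// may divide by zero, whereas 'urem i32 %x, 4' cannot.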
4730 static bool mayDivideByZero(Instruction &I) {
4731   assert((I.getOpcode() == Instruction::UDiv ||
4732           I.getOpcode() == Instruction::SDiv ||
4733           I.getOpcode() == Instruction::URem ||
4734           I.getOpcode() == Instruction::SRem) &&
4735          "Unexpected instruction");
4736   Value *Divisor = I.getOperand(1);
4737   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4738   return !CInt || CInt->isZero();
4739 }
4740 
4741 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4742                                            VPUser &User,
4743                                            VPTransformState &State) {
4744   switch (I.getOpcode()) {
4745   case Instruction::Call:
4746   case Instruction::Br:
4747   case Instruction::PHI:
4748   case Instruction::GetElementPtr:
4749   case Instruction::Select:
4750     llvm_unreachable("This instruction is handled by a different recipe.");
4751   case Instruction::UDiv:
4752   case Instruction::SDiv:
4753   case Instruction::SRem:
4754   case Instruction::URem:
4755   case Instruction::Add:
4756   case Instruction::FAdd:
4757   case Instruction::Sub:
4758   case Instruction::FSub:
4759   case Instruction::FNeg:
4760   case Instruction::Mul:
4761   case Instruction::FMul:
4762   case Instruction::FDiv:
4763   case Instruction::FRem:
4764   case Instruction::Shl:
4765   case Instruction::LShr:
4766   case Instruction::AShr:
4767   case Instruction::And:
4768   case Instruction::Or:
4769   case Instruction::Xor: {
4770     // Just widen unops and binops.
4771     setDebugLocFromInst(Builder, &I);
4772 
4773     for (unsigned Part = 0; Part < UF; ++Part) {
4774       SmallVector<Value *, 2> Ops;
4775       for (VPValue *VPOp : User.operands())
4776         Ops.push_back(State.get(VPOp, Part));
4777 
4778       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4779 
4780       if (auto *VecOp = dyn_cast<Instruction>(V))
4781         VecOp->copyIRFlags(&I);
4782 
4783       // Use this vector value for all users of the original instruction.
4784       State.set(Def, &I, V, Part);
4785       addMetadata(V, &I);
4786     }
4787 
4788     break;
4789   }
4790   case Instruction::ICmp:
4791   case Instruction::FCmp: {
4792     // Widen compares. Generate vector compares.
4793     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4794     auto *Cmp = cast<CmpInst>(&I);
4795     setDebugLocFromInst(Builder, Cmp);
4796     for (unsigned Part = 0; Part < UF; ++Part) {
4797       Value *A = State.get(User.getOperand(0), Part);
4798       Value *B = State.get(User.getOperand(1), Part);
4799       Value *C = nullptr;
4800       if (FCmp) {
4801         // Propagate fast math flags.
4802         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4803         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4804         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4805       } else {
4806         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4807       }
4808       State.set(Def, &I, C, Part);
4809       addMetadata(C, &I);
4810     }
4811 
4812     break;
4813   }
4814 
4815   case Instruction::ZExt:
4816   case Instruction::SExt:
4817   case Instruction::FPToUI:
4818   case Instruction::FPToSI:
4819   case Instruction::FPExt:
4820   case Instruction::PtrToInt:
4821   case Instruction::IntToPtr:
4822   case Instruction::SIToFP:
4823   case Instruction::UIToFP:
4824   case Instruction::Trunc:
4825   case Instruction::FPTrunc:
4826   case Instruction::BitCast: {
4827     auto *CI = cast<CastInst>(&I);
4828     setDebugLocFromInst(Builder, CI);
4829 
    // Vectorize casts.
4831     Type *DestTy =
4832         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4833 
4834     for (unsigned Part = 0; Part < UF; ++Part) {
4835       Value *A = State.get(User.getOperand(0), Part);
4836       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4837       State.set(Def, &I, Cast, Part);
4838       addMetadata(Cast, &I);
4839     }
4840     break;
4841   }
4842   default:
4843     // This instruction is not vectorized by simple widening.
4844     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4845     llvm_unreachable("Unhandled instruction!");
4846   } // end of switch.
4847 }
4848 
4849 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4850                                                VPUser &ArgOperands,
4851                                                VPTransformState &State) {
4852   assert(!isa<DbgInfoIntrinsic>(I) &&
4853          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4854   setDebugLocFromInst(Builder, &I);
4855 
4856   Module *M = I.getParent()->getParent()->getParent();
4857   auto *CI = cast<CallInst>(&I);
4858 
4859   SmallVector<Type *, 4> Tys;
4860   for (Value *ArgOperand : CI->arg_operands())
4861     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4862 
4863   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4864 
  // The flag indicates whether we use an intrinsic or a regular function call
  // for the vectorized version of the instruction, i.e. whether an intrinsic
  // call is more beneficial than a library call.
4868   bool NeedToScalarize = false;
4869   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4870   bool UseVectorIntrinsic =
4871       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4872   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4873          "Instruction should be scalarized elsewhere.");
4874 
4875   for (unsigned Part = 0; Part < UF; ++Part) {
4876     SmallVector<Value *, 4> Args;
4877     for (auto &I : enumerate(ArgOperands.operands())) {
4878       // Some intrinsics have a scalar argument - don't replace it with a
4879       // vector.
4880       Value *Arg;
4881       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4882         Arg = State.get(I.value(), Part);
4883       else
4884         Arg = State.get(I.value(), {0, 0});
4885       Args.push_back(Arg);
4886     }
4887 
4888     Function *VectorF;
4889     if (UseVectorIntrinsic) {
4890       // Use vector version of the intrinsic.
4891       Type *TysForDecl[] = {CI->getType()};
4892       if (VF.isVector()) {
4893         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4894         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4895       }
4896       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4897       assert(VectorF && "Can't retrieve vector intrinsic.");
4898     } else {
4899       // Use vector version of the function call.
4900       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4901 #ifndef NDEBUG
4902       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4903              "Can't create vector function.");
4904 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, &I, V, Part);
    addMetadata(V, &I);
4916   }
4917 }
4918 
4919 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
4920                                                  VPUser &Operands,
4921                                                  bool InvariantCond,
4922                                                  VPTransformState &State) {
4923   setDebugLocFromInst(Builder, &I);
4924 
  // The condition can be loop invariant but still defined inside the
4926   // loop. This means that we can't just use the original 'cond' value.
4927   // We have to take the 'vectorized' value and pick the first lane.
4928   // Instcombine will make this a no-op.
4929   auto *InvarCond =
4930       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4931 
4932   for (unsigned Part = 0; Part < UF; ++Part) {
4933     Value *Cond =
4934         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4935     Value *Op0 = State.get(Operands.getOperand(1), Part);
4936     Value *Op1 = State.get(Operands.getOperand(2), Part);
4937     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4938     State.set(VPDef, &I, Sel, Part);
4939     addMetadata(Sel, &I);
4940   }
4941 }
4942 
4943 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4944   // We should not collect Scalars more than once per VF. Right now, this
4945   // function is called from collectUniformsAndScalars(), which already does
4946   // this check. Collecting Scalars for VF=1 does not make any sense.
4947   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4948          "This function should not be visited twice for the same VF");
4949 
4950   SmallSetVector<Instruction *, 8> Worklist;
4951 
4952   // These sets are used to seed the analysis with pointers used by memory
4953   // accesses that will remain scalar.
4954   SmallSetVector<Instruction *, 8> ScalarPtrs;
4955   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4956   auto *Latch = TheLoop->getLoopLatch();
4957 
4958   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4959   // The pointer operands of loads and stores will be scalar as long as the
4960   // memory access is not a gather or scatter operation. The value operand of a
4961   // store will remain scalar if the store is scalarized.
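  // For example, the pointer operand of a load we decided to widen as a
  // consecutive access is a scalar use (only the lane-0 address is needed),
  // whereas the pointer of a gather is not.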
4962   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4963     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4964     assert(WideningDecision != CM_Unknown &&
4965            "Widening decision should be ready at this moment");
4966     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4967       if (Ptr == Store->getValueOperand())
4968         return WideningDecision == CM_Scalarize;
4969     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4970            "Ptr is neither a value or pointer operand");
4971     return WideningDecision != CM_GatherScatter;
4972   };
4973 
4974   // A helper that returns true if the given value is a bitcast or
4975   // getelementptr instruction contained in the loop.
4976   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4977     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4978             isa<GetElementPtrInst>(V)) &&
4979            !TheLoop->isLoopInvariant(V);
4980   };
4981 
4982   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4983     if (!isa<PHINode>(Ptr) ||
4984         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4985       return false;
4986     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4987     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4988       return false;
4989     return isScalarUse(MemAccess, Ptr);
4990   };
4991 
4992   // A helper that evaluates a memory access's use of a pointer. If the
4993   // pointer is actually the pointer induction of a loop, it is being
4994   // inserted into Worklist. If the use will be a scalar use, and the
4995   // pointer is only used by memory accesses, we place the pointer in
4996   // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
4997   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4998     if (isScalarPtrInduction(MemAccess, Ptr)) {
4999       Worklist.insert(cast<Instruction>(Ptr));
5000       Instruction *Update = cast<Instruction>(
5001           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5002       Worklist.insert(Update);
5003       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5004                         << "\n");
5005       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
5006                         << "\n");
5007       return;
5008     }
5009     // We only care about bitcast and getelementptr instructions contained in
5010     // the loop.
5011     if (!isLoopVaryingBitCastOrGEP(Ptr))
5012       return;
5013 
5014     // If the pointer has already been identified as scalar (e.g., if it was
5015     // also identified as uniform), there's nothing to do.
5016     auto *I = cast<Instruction>(Ptr);
5017     if (Worklist.count(I))
5018       return;
5019 
5020     // If the use of the pointer will be a scalar use, and all users of the
5021     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5022     // place the pointer in PossibleNonScalarPtrs.
5023     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5024           return isa<LoadInst>(U) || isa<StoreInst>(U);
5025         }))
5026       ScalarPtrs.insert(I);
5027     else
5028       PossibleNonScalarPtrs.insert(I);
5029   };
5030 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
5035   //
5036   // (1) Add to the worklist all instructions that have been identified as
5037   // uniform-after-vectorization.
5038   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5039 
5040   // (2) Add to the worklist all bitcast and getelementptr instructions used by
5041   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
5043   // scatter operation. The value operand of a store will remain scalar if the
5044   // store is scalarized.
5045   for (auto *BB : TheLoop->blocks())
5046     for (auto &I : *BB) {
5047       if (auto *Load = dyn_cast<LoadInst>(&I)) {
5048         evaluatePtrUse(Load, Load->getPointerOperand());
5049       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5050         evaluatePtrUse(Store, Store->getPointerOperand());
5051         evaluatePtrUse(Store, Store->getValueOperand());
5052       }
5053     }
5054   for (auto *I : ScalarPtrs)
5055     if (!PossibleNonScalarPtrs.count(I)) {
5056       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5057       Worklist.insert(I);
5058     }
5059 
5060   // Insert the forced scalars.
5061   // FIXME: Currently widenPHIInstruction() often creates a dead vector
5062   // induction variable when the PHI user is scalarized.
5063   auto ForcedScalar = ForcedScalars.find(VF);
5064   if (ForcedScalar != ForcedScalars.end())
5065     for (auto *I : ForcedScalar->second)
5066       Worklist.insert(I);
5067 
5068   // Expand the worklist by looking through any bitcasts and getelementptr
5069   // instructions we've already identified as scalar. This is similar to the
5070   // expansion step in collectLoopUniforms(); however, here we're only
5071   // expanding to include additional bitcasts and getelementptr instructions.
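  // For example, if a getelementptr already in the worklist takes its pointer
  // operand from a loop-varying bitcast whose only users are scalar memory
  // accesses or worklist members, that bitcast is added to the worklist too.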
5072   unsigned Idx = 0;
5073   while (Idx != Worklist.size()) {
5074     Instruction *Dst = Worklist[Idx++];
5075     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5076       continue;
5077     auto *Src = cast<Instruction>(Dst->getOperand(0));
5078     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5079           auto *J = cast<Instruction>(U);
5080           return !TheLoop->contains(J) || Worklist.count(J) ||
5081                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5082                   isScalarUse(J, Src));
5083         })) {
5084       Worklist.insert(Src);
5085       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5086     }
5087   }
5088 
5089   // An induction variable will remain scalar if all users of the induction
5090   // variable and induction variable update remain scalar.
5091   for (auto &Induction : Legal->getInductionVars()) {
5092     auto *Ind = Induction.first;
5093     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5094 
5095     // If tail-folding is applied, the primary induction variable will be used
5096     // to feed a vector compare.
5097     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5098       continue;
5099 
5100     // Determine if all users of the induction variable are scalar after
5101     // vectorization.
5102     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5103       auto *I = cast<Instruction>(U);
5104       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5105     });
5106     if (!ScalarInd)
5107       continue;
5108 
5109     // Determine if all users of the induction variable update instruction are
5110     // scalar after vectorization.
5111     auto ScalarIndUpdate =
5112         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5113           auto *I = cast<Instruction>(U);
5114           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5115         });
5116     if (!ScalarIndUpdate)
5117       continue;
5118 
5119     // The induction variable and its update instruction will remain scalar.
5120     Worklist.insert(Ind);
5121     Worklist.insert(IndUpdate);
5122     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5123     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5124                       << "\n");
5125   }
5126 
5127   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5128 }
5129 
5130 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
5131                                                          ElementCount VF) {
5132   if (!blockNeedsPredication(I->getParent()))
5133     return false;
5134   switch(I->getOpcode()) {
5135   default:
5136     break;
5137   case Instruction::Load:
5138   case Instruction::Store: {
5139     if (!Legal->isMaskRequired(I))
5140       return false;
5141     auto *Ptr = getLoadStorePointerOperand(I);
5142     auto *Ty = getMemInstValueType(I);
    // We have already decided how to vectorize this instruction; get that
    // result.
5145     if (VF.isVector()) {
5146       InstWidening WideningDecision = getWideningDecision(I, VF);
5147       assert(WideningDecision != CM_Unknown &&
5148              "Widening decision should be ready at this moment");
5149       return WideningDecision == CM_Scalarize;
5150     }
5151     const Align Alignment = getLoadStoreAlignment(I);
5152     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5153                                 isLegalMaskedGather(Ty, Alignment))
5154                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5155                                 isLegalMaskedScatter(Ty, Alignment));
5156   }
5157   case Instruction::UDiv:
5158   case Instruction::SDiv:
5159   case Instruction::SRem:
5160   case Instruction::URem:
5161     return mayDivideByZero(*I);
5162   }
5163   return false;
5164 }
5165 
5166 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5167     Instruction *I, ElementCount VF) {
5168   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5169   assert(getWideningDecision(I, VF) == CM_Unknown &&
5170          "Decision should not be set yet.");
5171   auto *Group = getInterleavedAccessGroup(I);
5172   assert(Group && "Must have a group.");
5173 
  // If the instruction's allocated size doesn't equal its type size, it
5175   // requires padding and will be scalarized.
5176   auto &DL = I->getModule()->getDataLayout();
5177   auto *ScalarTy = getMemInstValueType(I);
5178   if (hasIrregularType(ScalarTy, DL, VF))
5179     return false;
5180 
5181   // Check if masking is required.
5182   // A Group may need masking for one of two reasons: it resides in a block that
5183   // needs predication, or it was decided to use masking to deal with gaps.
5184   bool PredicatedAccessRequiresMasking =
5185       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5186   bool AccessWithGapsRequiresMasking =
5187       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5188   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5189     return true;
5190 
5191   // If masked interleaving is required, we expect that the user/target had
5192   // enabled it, because otherwise it either wouldn't have been created or
5193   // it should have been invalidated by the CostModel.
5194   assert(useMaskedInterleavedAccesses(TTI) &&
5195          "Masked interleave-groups for predicated accesses are not enabled.");
5196 
5197   auto *Ty = getMemInstValueType(I);
5198   const Align Alignment = getLoadStoreAlignment(I);
5199   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5200                           : TTI.isLegalMaskedStore(Ty, Alignment);
5201 }
5202 
5203 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5204     Instruction *I, ElementCount VF) {
5205   // Get and ensure we have a valid memory instruction.
5206   LoadInst *LI = dyn_cast<LoadInst>(I);
5207   StoreInst *SI = dyn_cast<StoreInst>(I);
5208   assert((LI || SI) && "Invalid memory instruction");
5209 
5210   auto *Ptr = getLoadStorePointerOperand(I);
5211 
5212   // In order to be widened, the pointer should be consecutive, first of all.
5213   if (!Legal->isConsecutivePtr(Ptr))
5214     return false;
5215 
5216   // If the instruction is a store located in a predicated block, it will be
5217   // scalarized.
5218   if (isScalarWithPredication(I))
5219     return false;
5220 
  // If the instruction's allocated size doesn't equal its type size, it
5222   // requires padding and will be scalarized.
5223   auto &DL = I->getModule()->getDataLayout();
5224   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5225   if (hasIrregularType(ScalarTy, DL, VF))
5226     return false;
5227 
5228   return true;
5229 }
5230 
5231 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5232   // We should not collect Uniforms more than once per VF. Right now,
5233   // this function is called from collectUniformsAndScalars(), which
5234   // already does this check. Collecting Uniforms for VF=1 does not make any
5235   // sense.
5236 
5237   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5238          "This function should not be visited twice for the same VF");
5239 
  // Create an entry for this VF so that, even if no uniform value is found,
  // we will not analyze it again: Uniforms.count(VF) will return 1.
5242   Uniforms[VF].clear();
5243 
5244   // We now know that the loop is vectorizable!
5245   // Collect instructions inside the loop that will remain uniform after
5246   // vectorization.
5247 
5248   // Global values, params and instructions outside of current loop are out of
5249   // scope.
5250   auto isOutOfScope = [&](Value *V) -> bool {
5251     Instruction *I = dyn_cast<Instruction>(V);
5252     return (!I || !TheLoop->contains(I));
5253   };
5254 
5255   SetVector<Instruction *> Worklist;
5256   BasicBlock *Latch = TheLoop->getLoopLatch();
5257 
5258   // Instructions that are scalar with predication must not be considered
5259   // uniform after vectorization, because that would create an erroneous
5260   // replicating region where only a single instance out of VF should be formed.
5261   // TODO: optimize such seldom cases if found important, see PR40816.
5262   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5263     if (isOutOfScope(I)) {
5264       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5265                         << *I << "\n");
5266       return;
5267     }
5268     if (isScalarWithPredication(I, VF)) {
5269       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5270                         << *I << "\n");
5271       return;
5272     }
5273     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5274     Worklist.insert(I);
5275   };
5276 
5277   // Start with the conditional branch. If the branch condition is an
5278   // instruction contained in the loop that is only used by the branch, it is
5279   // uniform.
5280   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5281   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5282     addToWorklistIfAllowed(Cmp);
5283 
5284   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5285     InstWidening WideningDecision = getWideningDecision(I, VF);
5286     assert(WideningDecision != CM_Unknown &&
5287            "Widening decision should be ready at this moment");
5288 
5289     // A uniform memory op is itself uniform.  We exclude uniform stores
5290     // here as they demand the last lane, not the first one.
5291     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5292       assert(WideningDecision == CM_Scalarize);
5293       return true;
5294     }
5295 
5296     return (WideningDecision == CM_Widen ||
5297             WideningDecision == CM_Widen_Reverse ||
5298             WideningDecision == CM_Interleave);
5299   };
5300 
5301 
5302   // Returns true if Ptr is the pointer operand of a memory access instruction
5303   // I, and I is known to not require scalarization.
5304   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5305     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5306   };
5307 
5308   // Holds a list of values which are known to have at least one uniform use.
5309   // Note that there may be other uses which aren't uniform.  A "uniform use"
5310   // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (i.e. this is not
  // the usual meaning of uniform).
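  // For example, the address of a consecutive widened load is only demanded
  // for lane 0; the remaining lanes' addresses are implied by the wide access.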
5313   SmallPtrSet<Value *, 8> HasUniformUse;
5314 
5315   // Scan the loop for instructions which are either a) known to have only
5316   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5317   for (auto *BB : TheLoop->blocks())
5318     for (auto &I : *BB) {
5319       // If there's no pointer operand, there's nothing to do.
5320       auto *Ptr = getLoadStorePointerOperand(&I);
5321       if (!Ptr)
5322         continue;
5323 
5324       // A uniform memory op is itself uniform.  We exclude uniform stores
5325       // here as they demand the last lane, not the first one.
5326       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5327         addToWorklistIfAllowed(&I);
5328 
5329       if (isUniformDecision(&I, VF)) {
5330         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5331         HasUniformUse.insert(Ptr);
5332       }
5333     }
5334 
5335   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5336   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5337   // disallows uses outside the loop as well.
5338   for (auto *V : HasUniformUse) {
5339     if (isOutOfScope(V))
5340       continue;
5341     auto *I = cast<Instruction>(V);
5342     auto UsersAreMemAccesses =
5343       llvm::all_of(I->users(), [&](User *U) -> bool {
5344         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5345       });
5346     if (UsersAreMemAccesses)
5347       addToWorklistIfAllowed(I);
5348   }
5349 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
5353   unsigned idx = 0;
5354   while (idx != Worklist.size()) {
5355     Instruction *I = Worklist[idx++];
5356 
5357     for (auto OV : I->operand_values()) {
5358       // isOutOfScope operands cannot be uniform instructions.
5359       if (isOutOfScope(OV))
5360         continue;
5361       // First order recurrence Phi's should typically be considered
5362       // non-uniform.
5363       auto *OP = dyn_cast<PHINode>(OV);
5364       if (OP && Legal->isFirstOrderRecurrence(OP))
5365         continue;
5366       // If all the users of the operand are uniform, then add the
5367       // operand into the uniform worklist.
5368       auto *OI = cast<Instruction>(OV);
5369       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5370             auto *J = cast<Instruction>(U);
5371             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5372           }))
5373         addToWorklistIfAllowed(OI);
5374     }
5375   }
5376 
5377   // For an instruction to be added into Worklist above, all its users inside
5378   // the loop should also be in Worklist. However, this condition cannot be
5379   // true for phi nodes that form a cyclic dependence. We must process phi
5380   // nodes separately. An induction variable will remain uniform if all users
5381   // of the induction variable and induction variable update remain uniform.
5382   // The code below handles both pointer and non-pointer induction variables.
5383   for (auto &Induction : Legal->getInductionVars()) {
5384     auto *Ind = Induction.first;
5385     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5386 
5387     // Determine if all users of the induction variable are uniform after
5388     // vectorization.
5389     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5390       auto *I = cast<Instruction>(U);
5391       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5392              isVectorizedMemAccessUse(I, Ind);
5393     });
5394     if (!UniformInd)
5395       continue;
5396 
5397     // Determine if all users of the induction variable update instruction are
5398     // uniform after vectorization.
5399     auto UniformIndUpdate =
5400         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5401           auto *I = cast<Instruction>(U);
5402           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5403                  isVectorizedMemAccessUse(I, IndUpdate);
5404         });
5405     if (!UniformIndUpdate)
5406       continue;
5407 
5408     // The induction variable and its update instruction will remain uniform.
5409     addToWorklistIfAllowed(Ind);
5410     addToWorklistIfAllowed(IndUpdate);
5411   }
5412 
5413   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5414 }
5415 
5416 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5417   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5418 
5419   if (Legal->getRuntimePointerChecking()->Need) {
5420     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5421         "runtime pointer checks needed. Enable vectorization of this "
5422         "loop with '#pragma clang loop vectorize(enable)' when "
5423         "compiling with -Os/-Oz",
5424         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5425     return true;
5426   }
5427 
5428   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5429     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5430         "runtime SCEV checks needed. Enable vectorization of this "
5431         "loop with '#pragma clang loop vectorize(enable)' when "
5432         "compiling with -Os/-Oz",
5433         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5434     return true;
5435   }
5436 
5437   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5438   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5439     reportVectorizationFailure("Runtime stride check for small trip count",
5440         "runtime stride == 1 checks needed. Enable vectorization of "
5441         "this loop without such check by compiling with -Os/-Oz",
5442         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5443     return true;
5444   }
5445 
5446   return false;
5447 }
5448 
5449 Optional<ElementCount>
5450 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5451   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to be
    // dynamically uniform if the target can skip it.
5454     reportVectorizationFailure(
5455         "Not inserting runtime ptr check for divergent target",
5456         "runtime pointer checks needed. Not enabled for divergent target",
5457         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5458     return None;
5459   }
5460 
5461   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5462   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5463   if (TC == 1) {
5464     reportVectorizationFailure("Single iteration (non) loop",
5465         "loop trip count is one, irrelevant for vectorization",
5466         "SingleIterationLoop", ORE, TheLoop);
5467     return None;
5468   }
5469 
5470   ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5471 
5472   switch (ScalarEpilogueStatus) {
5473   case CM_ScalarEpilogueAllowed:
5474     return MaxVF;
5475   case CM_ScalarEpilogueNotAllowedUsePredicate:
5476     LLVM_FALLTHROUGH;
5477   case CM_ScalarEpilogueNotNeededUsePredicate:
5478     LLVM_DEBUG(
5479         dbgs() << "LV: vector predicate hint/switch found.\n"
5480                << "LV: Not allowing scalar epilogue, creating predicated "
5481                << "vector loop.\n");
5482     break;
5483   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5484     // fallthrough as a special case of OptForSize
5485   case CM_ScalarEpilogueNotAllowedOptSize:
5486     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5487       LLVM_DEBUG(
5488           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5489     else
5490       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5491                         << "count.\n");
5492 
5493     // Bail if runtime checks are required, which are not good when optimising
5494     // for size.
5495     if (runtimeChecksRequired())
5496       return None;
5497 
5498     break;
5499   }
5500 
  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
5503   // that not every instruction executes on the last iteration.  This will
5504   // require a lane mask which varies through the vector loop body.  (TODO)
5505   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5506     // If there was a tail-folding hint/switch, but we can't fold the tail by
5507     // masking, fallback to a vectorization with a scalar epilogue.
5508     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5509       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5510                            "scalar epilogue instead.\n");
5511       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5512       return MaxVF;
5513     }
5514     return None;
5515   }
5516 
  // Now try tail folding.
5518 
5519   // Invalidate interleave groups that require an epilogue if we can't mask
5520   // the interleave-group.
5521   if (!useMaskedInterleavedAccesses(TTI)) {
5522     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5523            "No decisions should have been taken at this point");
5524     // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
5526     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5527   }
5528 
5529   assert(!MaxVF.isScalable() &&
5530          "Scalable vectors do not yet support tail folding");
5531   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5532          "MaxVF must be a power of 2");
5533   unsigned MaxVFtimesIC =
5534       UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5535   // Avoid tail folding if the trip count is known to be a multiple of any VF we
5536   // chose.
5537   ScalarEvolution *SE = PSE.getSE();
5538   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5539   const SCEV *ExitCount = SE->getAddExpr(
5540       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5541   const SCEV *Rem = SE->getURemExpr(
5542       ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5543   if (Rem->isZero()) {
5544     // Accept MaxVF if we do not have a tail.
5545     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5546     return MaxVF;
5547   }
5548 
5549   // If we don't know the precise trip count, or if the trip count that we
5550   // found modulo the vectorization factor is not zero, try to fold the tail
5551   // by masking.
5552   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5553   if (Legal->prepareToFoldTailByMasking()) {
5554     FoldTailByMasking = true;
5555     return MaxVF;
5556   }
5557 
5558   // If there was a tail-folding hint/switch, but we can't fold the tail by
5559   // masking, fallback to a vectorization with a scalar epilogue.
5560   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5561     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5562                          "scalar epilogue instead.\n");
5563     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5564     return MaxVF;
5565   }
5566 
5567   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5568     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5569     return None;
5570   }
5571 
5572   if (TC == 0) {
5573     reportVectorizationFailure(
5574         "Unable to calculate the loop count due to complex control flow",
5575         "unable to calculate the loop count due to complex control flow",
5576         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5577     return None;
5578   }
5579 
5580   reportVectorizationFailure(
5581       "Cannot optimize for size and vectorize at the same time.",
5582       "cannot optimize for size and vectorize at the same time. "
5583       "Enable vectorization of this loop with '#pragma clang loop "
5584       "vectorize(enable)' when compiling with -Os/-Oz",
5585       "NoTailLoopWithOptForSize", ORE, TheLoop);
5586   return None;
5587 }
5588 
5589 ElementCount
5590 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5591                                                  ElementCount UserVF) {
5592   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5593   unsigned SmallestType, WidestType;
5594   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5595   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5596 
5597   // Get the maximum safe dependence distance in bits computed by LAA.
5598   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
5600   // dependence distance).
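  // For example (illustrative figures), a 32-byte maximum safe dependence
  // distance over i32 accesses corresponds to a maximum safe VF of 8, i.e.
  // MaxSafeVectorWidthInBits = 8 * 4 * 8 = 256.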
5601   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
5602 
5603   if (UserVF.isNonZero()) {
5604     // For now, don't verify legality of scalable vectors.
5605     // This will be addressed properly in https://reviews.llvm.org/D91718.
5606     if (UserVF.isScalable())
5607       return UserVF;
5608 
5609     // If legally unsafe, clamp the user vectorization factor to a safe value.
5610     unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
5611     if (UserVF.getFixedValue() <= MaxSafeVF)
5612       return UserVF;
5613 
5614     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5615                       << " is unsafe, clamping to max safe VF=" << MaxSafeVF
5616                       << ".\n");
5617     ORE->emit([&]() {
5618       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5619                                         TheLoop->getStartLoc(),
5620                                         TheLoop->getHeader())
5621              << "User-specified vectorization factor "
5622              << ore::NV("UserVectorizationFactor", UserVF)
5623              << " is unsafe, clamping to maximum safe vectorization factor "
5624              << ore::NV("VectorizationFactor", MaxSafeVF);
5625     });
5626     return ElementCount::getFixed(MaxSafeVF);
5627   }
5628 
5629   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
5630 
5631   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
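  // For example (illustrative figures), WidestRegister = 384 bits with a
  // WidestType of 64 bits gives PowerOf2Floor(384 / 64) = PowerOf2Floor(6) = 4.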
5633   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
5634 
5635   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5636                     << " / " << WidestType << " bits.\n");
5637   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5638                     << WidestRegister << " bits.\n");
5639 
5640   assert(MaxVectorSize <= WidestRegister &&
5641          "Did not expect to pack so many elements"
5642          " into one vector!");
5643   if (MaxVectorSize == 0) {
5644     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5645     MaxVectorSize = 1;
5646     return ElementCount::getFixed(MaxVectorSize);
5647   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5648              isPowerOf2_32(ConstTripCount)) {
5649     // We need to clamp the VF to be the ConstTripCount. There is no point in
5650     // choosing a higher viable VF as done in the loop below.
5651     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5652                       << ConstTripCount << "\n");
5653     MaxVectorSize = ConstTripCount;
5654     return ElementCount::getFixed(MaxVectorSize);
5655   }
5656 
5657   unsigned MaxVF = MaxVectorSize;
5658   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5659       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5660     // Collect all viable vectorization factors larger than the default MaxVF
5661     // (i.e. MaxVectorSize).
5662     SmallVector<ElementCount, 8> VFs;
5663     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5664     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5665       VFs.push_back(ElementCount::getFixed(VS));
5666 
5667     // For each VF calculate its register usage.
5668     auto RUs = calculateRegisterUsage(VFs);
5669 
5670     // Select the largest VF which doesn't require more registers than existing
5671     // ones.
5672     for (int i = RUs.size() - 1; i >= 0; --i) {
5673       bool Selected = true;
5674       for (auto& pair : RUs[i].MaxLocalUsers) {
5675         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5676         if (pair.second > TargetNumRegisters)
5677           Selected = false;
5678       }
5679       if (Selected) {
5680         MaxVF = VFs[i].getKnownMinValue();
5681         break;
5682       }
5683     }
5684     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5685       if (MaxVF < MinVF) {
5686         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5687                           << ") with target's minimum: " << MinVF << '\n');
5688         MaxVF = MinVF;
5689       }
5690     }
5691   }
5692   return ElementCount::getFixed(MaxVF);
5693 }
5694 
5695 VectorizationFactor
5696 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
5697   // FIXME: This can be fixed for scalable vectors later, because at this stage
5698   // the LoopVectorizer will only consider vectorizing a loop with scalable
5699   // vectors when the loop has a hint to enable vectorization for a given VF.
5700   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
5701 
5702   float Cost = expectedCost(ElementCount::getFixed(1)).first;
5703   const float ScalarCost = Cost;
5704   unsigned Width = 1;
5705   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5706 
5707   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5708   if (ForceVectorization && MaxVF.isVector()) {
5709     // Ignore scalar width, because the user explicitly wants vectorization.
5710     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5711     // evaluation.
5712     Cost = std::numeric_limits<float>::max();
5713   }
5714 
5715   for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
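    // For example (illustrative costs), if the scalar loop costs 8 and the
    // VF = 4 vector body costs 20, the per-iteration vector cost is 20 / 4 = 5,
    // which is cheaper than the scalar loop.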
5719     VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5720     float VectorCost = C.first / (float)i;
5721     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5722                       << " costs: " << (int)VectorCost << ".\n");
5723     if (!C.second && !ForceVectorization) {
5724       LLVM_DEBUG(
5725           dbgs() << "LV: Not considering vector loop of width " << i
5726                  << " because it will not generate any vector instructions.\n");
5727       continue;
5728     }
5729 
    // If profitable, add it to the ProfitableVFs list.
5731     if (VectorCost < ScalarCost) {
5732       ProfitableVFs.push_back(VectorizationFactor(
5733           {ElementCount::getFixed(i), (unsigned)VectorCost}));
5734     }
5735 
5736     if (VectorCost < Cost) {
5737       Cost = VectorCost;
5738       Width = i;
5739     }
5740   }
5741 
5742   if (!EnableCondStoresVectorization && NumPredStores) {
5743     reportVectorizationFailure("There are conditional stores.",
5744         "store that is conditionally executed prevents vectorization",
5745         "ConditionalStore", ORE, TheLoop);
5746     Width = 1;
5747     Cost = ScalarCost;
5748   }
5749 
5750   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5751              << "LV: Vectorization seems to be not beneficial, "
5752              << "but was forced by a user.\n");
5753   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5754   VectorizationFactor Factor = {ElementCount::getFixed(Width),
5755                                 (unsigned)(Width * Cost)};
5756   return Factor;
5757 }
5758 
5759 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5760     const Loop &L, ElementCount VF) const {
5761   // Cross iteration phis such as reductions need special handling and are
5762   // currently unsupported.
5763   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5764         return Legal->isFirstOrderRecurrence(&Phi) ||
5765                Legal->isReductionVariable(&Phi);
5766       }))
5767     return false;
5768 
5769   // Phis with uses outside of the loop require special handling and are
5770   // currently unsupported.
5771   for (auto &Entry : Legal->getInductionVars()) {
5772     // Look for uses of the value of the induction at the last iteration.
5773     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5774     for (User *U : PostInc->users())
5775       if (!L.contains(cast<Instruction>(U)))
5776         return false;
    // Look for uses of the penultimate value of the induction.
5778     for (User *U : Entry.first->users())
5779       if (!L.contains(cast<Instruction>(U)))
5780         return false;
5781   }
5782 
5783   // Induction variables that are widened require special handling that is
5784   // currently not supported.
5785   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5786         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5787                  this->isProfitableToScalarize(Entry.first, VF));
5788       }))
5789     return false;
5790 
5791   return true;
5792 }
5793 
5794 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5795     const ElementCount VF) const {
5796   // FIXME: We need a much better cost-model to take different parameters such
5797   // as register pressure, code size increase and cost of extra branches into
5798   // account. For now we apply a very crude heuristic and only consider loops
5799   // with vectorization factors larger than a certain value.
5800   // We also consider epilogue vectorization unprofitable for targets that don't
5801   // consider interleaving beneficial (eg. MVE).
5802   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5803     return false;
5804   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5805     return true;
5806   return false;
5807 }
5808 
5809 VectorizationFactor
5810 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5811     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5812   VectorizationFactor Result = VectorizationFactor::Disabled();
5813   if (!EnableEpilogueVectorization) {
5814     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5815     return Result;
5816   }
5817 
5818   if (!isScalarEpilogueAllowed()) {
5819     LLVM_DEBUG(
5820         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5821                   "allowed.\n";);
5822     return Result;
5823   }
5824 
5825   // FIXME: This can be fixed for scalable vectors later, because at this stage
5826   // the LoopVectorizer will only consider vectorizing a loop with scalable
5827   // vectors when the loop has a hint to enable vectorization for a given VF.
5828   if (MainLoopVF.isScalable()) {
5829     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
5830                          "yet supported.\n");
5831     return Result;
5832   }
5833 
5834   // Not really a cost consideration, but check for unsupported cases here to
5835   // simplify the logic.
5836   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5837     LLVM_DEBUG(
5838         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5839                   "not a supported candidate.\n";);
5840     return Result;
5841   }
5842 
5843   if (EpilogueVectorizationForceVF > 1) {
5844     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5845     if (LVP.hasPlanWithVFs(
5846             {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
5847       return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
5848     else {
5849       LLVM_DEBUG(
5850           dbgs()
5851               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5852       return Result;
5853     }
5854   }
5855 
5856   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5857       TheLoop->getHeader()->getParent()->hasMinSize()) {
5858     LLVM_DEBUG(
5859         dbgs()
5860             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5861     return Result;
5862   }
5863 
5864   if (!isEpilogueVectorizationProfitable(MainLoopVF))
5865     return Result;
5866 
5867   for (auto &NextVF : ProfitableVFs)
5868     if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
5869         (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
5870         LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
5871       Result = NextVF;
5872 
5873   if (Result != VectorizationFactor::Disabled())
5874     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5875                       << Result.Width.getFixedValue() << "\n";);
5876   return Result;
5877 }
5878 
5879 std::pair<unsigned, unsigned>
5880 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5881   unsigned MinWidth = -1U;
5882   unsigned MaxWidth = 8;
5883   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5884 
5885   // For each block.
5886   for (BasicBlock *BB : TheLoop->blocks()) {
5887     // For each instruction in the loop.
5888     for (Instruction &I : BB->instructionsWithoutDebug()) {
5889       Type *T = I.getType();
5890 
5891       // Skip ignored values.
5892       if (ValuesToIgnore.count(&I))
5893         continue;
5894 
5895       // Only examine Loads, Stores and PHINodes.
5896       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5897         continue;
5898 
5899       // Examine PHI nodes that are reduction variables. Update the type to
5900       // account for the recurrence type.
5901       if (auto *PN = dyn_cast<PHINode>(&I)) {
5902         if (!Legal->isReductionVariable(PN))
5903           continue;
5904         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5905         T = RdxDesc.getRecurrenceType();
5906       }
5907 
5908       // Examine the stored values.
5909       if (auto *ST = dyn_cast<StoreInst>(&I))
5910         T = ST->getValueOperand()->getType();
5911 
5912       // Ignore loaded pointer types and stored pointer types that are not
5913       // vectorizable.
5914       //
5915       // FIXME: The check here attempts to predict whether a load or store will
5916       //        be vectorized. We only know this for certain after a VF has
5917       //        been selected. Here, we assume that if an access can be
5918       //        vectorized, it will be. We should also look at extending this
5919       //        optimization to non-pointer types.
5920       //
5921       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5922           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5923         continue;
5924 
5925       MinWidth = std::min(MinWidth,
5926                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5927       MaxWidth = std::max(MaxWidth,
5928                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5929     }
5930   }
5931 
5932   return {MinWidth, MaxWidth};
5933 }
5934 
5935 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5936                                                            unsigned LoopCost) {
5937   // -- The interleave heuristics --
5938   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5939   // There are many micro-architectural considerations that we can't predict
5940   // at this level. For example, frontend pressure (on decode or fetch) due to
5941   // code size, or the number and capabilities of the execution ports.
5942   //
5943   // We use the following heuristics to select the interleave count:
5944   // 1. If the code has reductions, then we interleave to break the cross
5945   // iteration dependency.
5946   // 2. If the loop is really small, then we interleave to reduce the loop
5947   // overhead.
5948   // 3. We don't interleave if we think that we will spill registers to memory
5949   // due to the increased register pressure.
5950 
5951   if (!isScalarEpilogueAllowed())
5952     return 1;
5953 
  // A finite maximum safe dependence distance already constrains how many
  // iterations may safely be combined, so do not interleave.
5955   if (Legal->getMaxSafeDepDistBytes() != -1U)
5956     return 1;
5957 
5958   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5959   const bool HasReductions = !Legal->getReductionVars().empty();
5960   // Do not interleave loops with a relatively small known or estimated trip
5961   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because under those conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
5965   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5966       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5967     return 1;
5968 
5969   RegisterUsage R = calculateRegisterUsage({VF})[0];
5970   // We divide by these constants so assume that we have at least one
5971   // instruction that uses at least one register.
5972   for (auto& pair : R.MaxLocalUsers) {
5973     pair.second = std::max(pair.second, 1U);
5974   }
5975 
5976   // We calculate the interleave count using the following formula.
5977   // Subtract the number of loop invariants from the number of available
5978   // registers. These registers are used by all of the interleaved instances.
5979   // Next, divide the remaining registers by the number of registers that is
5980   // required by the loop, in order to estimate how many parallel instances
5981   // fit without causing spills. All of this is rounded down if necessary to be
5982   // a power of two. We want power of two interleave count to simplify any
5983   // addressing operations or alignment considerations.
5984   // We also want power of two interleave counts to ensure that the induction
5985   // variable of the vector loop wraps to zero, when tail is folded by masking;
5986   // this currently happens when OptForSize, in which case IC is set to 1 above.
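  //
  // For illustration only (hypothetical numbers, ignoring the induction
  // variable adjustment below): with 32 registers in a class, 2 of them held
  // by loop-invariant values and 6 live values per iteration,
  // (32 - 2) / 6 = 5, which rounds down to an interleave count of 4.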
5987   unsigned IC = UINT_MAX;
5988 
5989   for (auto& pair : R.MaxLocalUsers) {
5990     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5991     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5992                       << " registers of "
5993                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5994     if (VF.isScalar()) {
5995       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5996         TargetNumRegisters = ForceTargetNumScalarRegs;
5997     } else {
5998       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5999         TargetNumRegisters = ForceTargetNumVectorRegs;
6000     }
6001     unsigned MaxLocalUsers = pair.second;
6002     unsigned LoopInvariantRegs = 0;
6003     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6004       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6005 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
6007     // Don't count the induction variable as interleaved.
6008     if (EnableIndVarRegisterHeur) {
6009       TmpIC =
6010           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6011                         std::max(1U, (MaxLocalUsers - 1)));
6012     }
6013 
6014     IC = std::min(IC, TmpIC);
6015   }
6016 
6017   // Clamp the interleave ranges to reasonable counts.
6018   unsigned MaxInterleaveCount =
6019       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6020 
6021   // Check if the user has overridden the max.
6022   if (VF.isScalar()) {
6023     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6024       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6025   } else {
6026     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6027       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6028   }
6029 
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count to at most the trip count divided by VF, while
  // making sure it is at least 1.
6033   //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second
  // vector iteration is enabled. However, for larger loops, there is likely
  // to be a similar benefit as for fixed-width vectors. For now, we choose to
  // leave the InterleaveCount as if vscale is '1', although if some
  // information about the vector is known (e.g. min vector size), we could
  // make a better decision.
6040   if (BestKnownTC) {
6041     MaxInterleaveCount =
6042         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6043     // Make sure MaxInterleaveCount is greater than 0.
6044     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6045   }
6046 
6047   assert(MaxInterleaveCount > 0 &&
6048          "Maximum interleave count must be greater than 0");
6049 
  // Clamp the calculated IC to be between 1 and the max interleave count that
  // the target and trip count allow.
6052   if (IC > MaxInterleaveCount)
6053     IC = MaxInterleaveCount;
6054   else
6055     // Make sure IC is greater than 0.
6056     IC = std::max(1u, IC);
6057 
6058   assert(IC > 0 && "Interleave count must be greater than 0.");
6059 
6060   // If we did not calculate the cost for VF (because the user selected the VF)
6061   // then we calculate the cost of VF here.
6062   if (LoopCost == 0)
6063     LoopCost = expectedCost(VF).first;
6064 
6065   assert(LoopCost && "Non-zero loop cost expected");
6066 
6067   // Interleave if we vectorized this loop and there is a reduction that could
6068   // benefit from interleaving.
6069   if (VF.isVector() && HasReductions) {
6070     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6071     return IC;
6072   }
6073 
6074   // Note that if we've already vectorized the loop we will have done the
6075   // runtime check and so interleaving won't require further checks.
6076   bool InterleavingRequiresRuntimePointerCheck =
6077       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6078 
6079   // We want to interleave small loops in order to reduce the loop overhead and
6080   // potentially expose ILP opportunities.
6081   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6082                     << "LV: IC is " << IC << '\n'
6083                     << "LV: VF is " << VF << '\n');
6084   const bool AggressivelyInterleaveReductions =
6085       TTI.enableAggressiveInterleaving(HasReductions);
6086   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the loop overhead has a cost of 1 and use the cost model
    // to estimate the total cost of the loop; we then interleave until the
    // loop overhead is about 5% of the cost of the loop.
6090     unsigned SmallIC =
6091         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
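    // For illustration only (hypothetical numbers): if the small-loop cost
    // threshold is 20 and the loop cost is 6, PowerOf2Floor(20 / 6) = 2, so
    // SmallIC is capped at 2.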
6092 
6093     // Interleave until store/load ports (estimated by max interleave count) are
6094     // saturated.
6095     unsigned NumStores = Legal->getNumStores();
6096     unsigned NumLoads = Legal->getNumLoads();
6097     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6098     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
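    // For illustration only: with IC = 8, two stores and one load, StoresIC
    // is 4 and LoadsIC is 8.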
6099 
    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit this, by default, to 2,
    // so the critical path only gets increased by one reduction operation.
6104     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6105       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6106       SmallIC = std::min(SmallIC, F);
6107       StoresIC = std::min(StoresIC, F);
6108       LoadsIC = std::min(LoadsIC, F);
6109     }
6110 
6111     if (EnableLoadStoreRuntimeInterleave &&
6112         std::max(StoresIC, LoadsIC) > SmallIC) {
6113       LLVM_DEBUG(
6114           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6115       return std::max(StoresIC, LoadsIC);
6116     }
6117 
6118     // If there are scalar reductions and TTI has enabled aggressive
6119     // interleaving for reductions, we will interleave to expose ILP.
6120     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6121         AggressivelyInterleaveReductions) {
6122       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressively as the normal
      // IC, to cope with the rare situation where resources are too limited.
6125       return std::max(IC / 2, SmallIC);
6126     } else {
6127       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6128       return SmallIC;
6129     }
6130   }
6131 
6132   // Interleave if this is a large loop (small loops are already dealt with by
6133   // this point) that could benefit from interleaving.
6134   if (AggressivelyInterleaveReductions) {
6135     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6136     return IC;
6137   }
6138 
6139   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6140   return 1;
6141 }
6142 
6143 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6144 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6145   // This function calculates the register usage by measuring the highest number
6146   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are
6149   // met before their users. We assume that each instruction that has in-loop
6150   // users starts an interval. We record every time that an in-loop value is
6151   // used, so we have a list of the first and last occurrences of each
6152   // instruction. Next, we transpose this data structure into a multi map that
6153   // holds the list of intervals that *end* at a specific location. This multi
6154   // map allows us to perform a linear search. We scan the instructions linearly
6155   // and record each time that a new interval starts, by placing it in a set.
6156   // If we find this value in the multi-map then we remove it from the set.
6157   // The max register usage is the maximum size of the set.
6158   // We also search for instructions that are defined outside the loop, but are
6159   // used inside the loop. We need this number separately from the max-interval
6160   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
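  //
  // For illustration only: in the straight-line sequence
  //   %a = load ..., %b = load ..., %c = add %a, %b
  // both %a and %b are still live when %c is reached, so two intervals are
  // open at once and this snippet would be charged two registers of the
  // corresponding class.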
6162   LoopBlocksDFS DFS(TheLoop);
6163   DFS.perform(LI);
6164 
6165   RegisterUsage RU;
6166 
6167   // Each 'key' in the map opens a new interval. The values
6168   // of the map are the index of the 'last seen' usage of the
6169   // instruction that is the key.
6170   using IntervalMap = DenseMap<Instruction *, unsigned>;
6171 
6172   // Maps instruction to its index.
6173   SmallVector<Instruction *, 64> IdxToInstr;
6174   // Marks the end of each interval.
6175   IntervalMap EndPoint;
  // Saves the set of instructions that are used within the loop.
6177   SmallPtrSet<Instruction *, 8> Ends;
6178   // Saves the list of values that are used in the loop but are
6179   // defined outside the loop, such as arguments and constants.
6180   SmallPtrSet<Value *, 8> LoopInvariants;
6181 
6182   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6183     for (Instruction &I : BB->instructionsWithoutDebug()) {
6184       IdxToInstr.push_back(&I);
6185 
6186       // Save the end location of each USE.
6187       for (Value *U : I.operands()) {
6188         auto *Instr = dyn_cast<Instruction>(U);
6189 
6190         // Ignore non-instruction values such as arguments, constants, etc.
6191         if (!Instr)
6192           continue;
6193 
6194         // If this instruction is outside the loop then record it and continue.
6195         if (!TheLoop->contains(Instr)) {
6196           LoopInvariants.insert(Instr);
6197           continue;
6198         }
6199 
6200         // Overwrite previous end points.
6201         EndPoint[Instr] = IdxToInstr.size();
6202         Ends.insert(Instr);
6203       }
6204     }
6205   }
6206 
6207   // Saves the list of intervals that end with the index in 'key'.
6208   using InstrList = SmallVector<Instruction *, 2>;
6209   DenseMap<unsigned, InstrList> TransposeEnds;
6210 
6211   // Transpose the EndPoints to a list of values that end at each index.
6212   for (auto &Interval : EndPoint)
6213     TransposeEnds[Interval.second].push_back(Interval.first);
6214 
6215   SmallPtrSet<Instruction *, 8> OpenIntervals;
6216   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6217   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6218 
6219   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6220 
6221   // A lambda that gets the register usage for the given type and VF.
6222   const auto &TTICapture = TTI;
6223   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
6224     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6225       return 0U;
6226     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6227   };
6228 
6229   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6230     Instruction *I = IdxToInstr[i];
6231 
6232     // Remove all of the instructions that end at this location.
6233     InstrList &List = TransposeEnds[i];
6234     for (Instruction *ToRemove : List)
6235       OpenIntervals.erase(ToRemove);
6236 
6237     // Ignore instructions that are never used within the loop.
6238     if (!Ends.count(I))
6239       continue;
6240 
6241     // Skip ignored values.
6242     if (ValuesToIgnore.count(I))
6243       continue;
6244 
6245     // For each VF find the maximum usage of registers.
6246     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6247       // Count the number of live intervals.
6248       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6249 
6250       if (VFs[j].isScalar()) {
6251         for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
6253           if (RegUsage.find(ClassID) == RegUsage.end())
6254             RegUsage[ClassID] = 1;
6255           else
6256             RegUsage[ClassID] += 1;
6257         }
6258       } else {
6259         collectUniformsAndScalars(VFs[j]);
6260         for (auto Inst : OpenIntervals) {
6261           // Skip ignored values for VF > 1.
6262           if (VecValuesToIgnore.count(Inst))
6263             continue;
6264           if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
6266             if (RegUsage.find(ClassID) == RegUsage.end())
6267               RegUsage[ClassID] = 1;
6268             else
6269               RegUsage[ClassID] += 1;
6270           } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
6272             if (RegUsage.find(ClassID) == RegUsage.end())
6273               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6274             else
6275               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6276           }
6277         }
6278       }
6279 
6280       for (auto& pair : RegUsage) {
6281         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] =
              std::max(MaxUsages[j][pair.first], pair.second);
6283         else
6284           MaxUsages[j][pair.first] = pair.second;
6285       }
6286     }
6287 
6288     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6289                       << OpenIntervals.size() << '\n');
6290 
6291     // Add the current instruction to the list of open intervals.
6292     OpenIntervals.insert(I);
6293   }
6294 
6295   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6296     SmallMapVector<unsigned, unsigned, 4> Invariant;
6297 
6298     for (auto Inst : LoopInvariants) {
6299       unsigned Usage =
6300           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6301       unsigned ClassID =
6302           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6303       if (Invariant.find(ClassID) == Invariant.end())
6304         Invariant[ClassID] = Usage;
6305       else
6306         Invariant[ClassID] += Usage;
6307     }
6308 
6309     LLVM_DEBUG({
6310       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6311       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6312              << " item\n";
6313       for (const auto &pair : MaxUsages[i]) {
6314         dbgs() << "LV(REG): RegisterClass: "
6315                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6316                << " registers\n";
6317       }
6318       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6319              << " item\n";
6320       for (const auto &pair : Invariant) {
6321         dbgs() << "LV(REG): RegisterClass: "
6322                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6323                << " registers\n";
6324       }
6325     });
6326 
6327     RU.LoopInvariantRegs = Invariant;
6328     RU.MaxLocalUsers = MaxUsages[i];
6329     RUs[i] = RU;
6330   }
6331 
6332   return RUs;
6333 }
6334 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
6336   // TODO: Cost model for emulated masked load/store is completely
6337   // broken. This hack guides the cost model to use an artificially
6338   // high enough value to practically disable vectorization with such
6339   // operations, except where previously deployed legality hack allowed
6340   // using very low cost values. This is to avoid regressions coming simply
6341   // from moving "masked load/store" check from legality to cost model.
6342   // Masked Load/Gather emulation was previously never allowed.
  // A limited amount of Masked Store/Scatter emulation was allowed.
6344   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
6345   return isa<LoadInst>(I) ||
6346          (isa<StoreInst>(I) &&
6347           NumPredStores > NumberOfStoresToPredicate);
6348 }
6349 
6350 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6351   // If we aren't vectorizing the loop, or if we've already collected the
6352   // instructions to scalarize, there's nothing to do. Collection may already
6353   // have occurred if we have a user-selected VF and are now computing the
6354   // expected cost for interleaving.
6355   if (VF.isScalar() || VF.isZero() ||
6356       InstsToScalarize.find(VF) != InstsToScalarize.end())
6357     return;
6358 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6360   // not profitable to scalarize any instructions, the presence of VF in the
6361   // map will indicate that we've analyzed it already.
6362   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6363 
  // Find all the instructions that are scalar with predication in the loop and
  // determine whether it would be better not to if-convert the blocks they are
  // in. If so, we also record the instructions to scalarize.
6367   for (BasicBlock *BB : TheLoop->blocks()) {
6368     if (!blockNeedsPredication(BB))
6369       continue;
6370     for (Instruction &I : *BB)
6371       if (isScalarWithPredication(&I)) {
6372         ScalarCostsTy ScalarCosts;
6373         // Do not apply discount logic if hacked cost is needed
6374         // for emulated masked memrefs.
6375         if (!useEmulatedMaskMemRefHack(&I) &&
6376             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6377           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6378         // Remember that BB will remain after vectorization.
6379         PredicatedBBsAfterVectorization.insert(BB);
6380       }
6381   }
6382 }
6383 
6384 int LoopVectorizationCostModel::computePredInstDiscount(
6385     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
6386     ElementCount VF) {
6387   assert(!isUniformAfterVectorization(PredInst, VF) &&
6388          "Instruction marked uniform-after-vectorization will be predicated");
6389 
6390   // Initialize the discount to zero, meaning that the scalar version and the
6391   // vector version cost the same.
6392   int Discount = 0;
6393 
6394   // Holds instructions to analyze. The instructions we visit are mapped in
6395   // ScalarCosts. Those instructions are the ones that would be scalarized if
6396   // we find that the scalar version costs less.
6397   SmallVector<Instruction *, 8> Worklist;
6398 
6399   // Returns true if the given instruction can be scalarized.
6400   auto canBeScalarized = [&](Instruction *I) -> bool {
6401     // We only attempt to scalarize instructions forming a single-use chain
6402     // from the original predicated block that would otherwise be vectorized.
6403     // Although not strictly necessary, we give up on instructions we know will
6404     // already be scalar to avoid traversing chains that are unlikely to be
6405     // beneficial.
6406     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6407         isScalarAfterVectorization(I, VF))
6408       return false;
6409 
6410     // If the instruction is scalar with predication, it will be analyzed
6411     // separately. We ignore it within the context of PredInst.
6412     if (isScalarWithPredication(I))
6413       return false;
6414 
6415     // If any of the instruction's operands are uniform after vectorization,
6416     // the instruction cannot be scalarized. This prevents, for example, a
6417     // masked load from being scalarized.
6418     //
6419     // We assume we will only emit a value for lane zero of an instruction
6420     // marked uniform after vectorization, rather than VF identical values.
6421     // Thus, if we scalarize an instruction that uses a uniform, we would
6422     // create uses of values corresponding to the lanes we aren't emitting code
6423     // for. This behavior can be changed by allowing getScalarValue to clone
6424     // the lane zero values for uniforms rather than asserting.
6425     for (Use &U : I->operands())
6426       if (auto *J = dyn_cast<Instruction>(U.get()))
6427         if (isUniformAfterVectorization(J, VF))
6428           return false;
6429 
6430     // Otherwise, we can scalarize the instruction.
6431     return true;
6432   };
6433 
6434   // Compute the expected cost discount from scalarizing the entire expression
6435   // feeding the predicated instruction. We currently only consider expressions
6436   // that are single-use instruction chains.
6437   Worklist.push_back(PredInst);
6438   while (!Worklist.empty()) {
6439     Instruction *I = Worklist.pop_back_val();
6440 
6441     // If we've already analyzed the instruction, there's nothing to do.
6442     if (ScalarCosts.find(I) != ScalarCosts.end())
6443       continue;
6444 
6445     // Compute the cost of the vector instruction. Note that this cost already
6446     // includes the scalarization overhead of the predicated instruction.
6447     unsigned VectorCost = getInstructionCost(I, VF).first;
6448 
6449     // Compute the cost of the scalarized instruction. This cost is the cost of
6450     // the instruction as if it wasn't if-converted and instead remained in the
6451     // predicated block. We will scale this cost by block probability after
6452     // computing the scalarization overhead.
6453     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6454     unsigned ScalarCost =
6455         VF.getKnownMinValue() *
6456         getInstructionCost(I, ElementCount::getFixed(1)).first;
6457 
6458     // Compute the scalarization overhead of needed insertelement instructions
6459     // and phi nodes.
6460     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6461       ScalarCost += TTI.getScalarizationOverhead(
6462           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6463           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6464       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6465       ScalarCost +=
6466           VF.getKnownMinValue() *
6467           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6468     }
6469 
6470     // Compute the scalarization overhead of needed extractelement
6471     // instructions. For each of the instruction's operands, if the operand can
6472     // be scalarized, add it to the worklist; otherwise, account for the
6473     // overhead.
6474     for (Use &U : I->operands())
6475       if (auto *J = dyn_cast<Instruction>(U.get())) {
6476         assert(VectorType::isValidElementType(J->getType()) &&
6477                "Instruction has non-scalar type");
6478         if (canBeScalarized(J))
6479           Worklist.push_back(J);
6480         else if (needsExtract(J, VF)) {
6481           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6482           ScalarCost += TTI.getScalarizationOverhead(
6483               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6484               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6485         }
6486       }
6487 
6488     // Scale the total scalar cost by block probability.
6489     ScalarCost /= getReciprocalPredBlockProb();
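    // For illustration only: with a reciprocal block probability of 2 (i.e.
    // the predicated block is assumed to execute half the time), a scalar
    // cost of 8 is scaled down to 4 before the discount is computed.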
6490 
6491     // Compute the discount. A non-negative discount means the vector version
6492     // of the instruction costs more, and scalarizing would be beneficial.
6493     Discount += VectorCost - ScalarCost;
6494     ScalarCosts[I] = ScalarCost;
6495   }
6496 
6497   return Discount;
6498 }
6499 
6500 LoopVectorizationCostModel::VectorizationCostTy
6501 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6502   VectorizationCostTy Cost;
6503 
6504   // For each block.
6505   for (BasicBlock *BB : TheLoop->blocks()) {
6506     VectorizationCostTy BlockCost;
6507 
6508     // For each instruction in the old loop.
6509     for (Instruction &I : BB->instructionsWithoutDebug()) {
6510       // Skip ignored values.
6511       if (ValuesToIgnore.count(&I) ||
6512           (VF.isVector() && VecValuesToIgnore.count(&I)))
6513         continue;
6514 
6515       VectorizationCostTy C = getInstructionCost(&I, VF);
6516 
6517       // Check if we should override the cost.
6518       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6519         C.first = ForceTargetInstructionCost;
6520 
6521       BlockCost.first += C.first;
6522       BlockCost.second |= C.second;
6523       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6524                         << " for VF " << VF << " For instruction: " << I
6525                         << '\n');
6526     }
6527 
6528     // If we are vectorizing a predicated block, it will have been
6529     // if-converted. This means that the block's instructions (aside from
6530     // stores and instructions that may divide by zero) will now be
6531     // unconditionally executed. For the scalar case, we may not always execute
6532     // the predicated block, if it is an if-else block. Thus, scale the block's
6533     // cost by the probability of executing it. blockNeedsPredication from
6534     // Legal is used so as to not include all blocks in tail folded loops.
6535     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6536       BlockCost.first /= getReciprocalPredBlockProb();
6537 
6538     Cost.first += BlockCost.first;
6539     Cost.second |= BlockCost.second;
6540   }
6541 
6542   return Cost;
6543 }
6544 
6545 /// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except for the induction variable dependence.
6547 ///
6548 /// This SCEV can be sent to the Target in order to estimate the address
6549 /// calculation cost.
6550 static const SCEV *getAddressAccessSCEV(
6551               Value *Ptr,
6552               LoopVectorizationLegality *Legal,
6553               PredicatedScalarEvolution &PSE,
6554               const Loop *TheLoop) {
6555 
6556   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6557   if (!Gep)
6558     return nullptr;
6559 
6560   // We are looking for a gep with all loop invariant indices except for one
6561   // which should be an induction variable.
6562   auto SE = PSE.getSE();
6563   unsigned NumOperands = Gep->getNumOperands();
6564   for (unsigned i = 1; i < NumOperands; ++i) {
6565     Value *Opd = Gep->getOperand(i);
6566     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6567         !Legal->isInductionVariable(Opd))
6568       return nullptr;
6569   }
6570 
6571   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
6572   return PSE.getSCEV(Ptr);
6573 }
6574 
6575 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6576   return Legal->hasStride(I->getOperand(0)) ||
6577          Legal->hasStride(I->getOperand(1));
6578 }
6579 
6580 unsigned
6581 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6582                                                         ElementCount VF) {
6583   assert(VF.isVector() &&
6584          "Scalarization cost of instruction implies vectorization.");
6585   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6586   Type *ValTy = getMemInstValueType(I);
6587   auto SE = PSE.getSE();
6588 
6589   unsigned AS = getLoadStoreAddressSpace(I);
6590   Value *Ptr = getLoadStorePointerOperand(I);
6591   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6592 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6595   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6596 
6597   // Get the cost of the scalar memory instruction and address computation.
6598   unsigned Cost =
6599       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6600 
6601   // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where its user will be a vectorized instruction.
6603   const Align Alignment = getLoadStoreAlignment(I);
6604   Cost += VF.getKnownMinValue() *
6605           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6606                               AS, TTI::TCK_RecipThroughput);
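  // For illustration only (hypothetical unit costs): with VF = 4 and a cost
  // of 1 for both the address computation and the scalar memory op, the
  // running total at this point is 4 * 1 + 4 * 1 = 8.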
6607 
6608   // Get the overhead of the extractelement and insertelement instructions
6609   // we might create due to scalarization.
6610   Cost += getScalarizationOverhead(I, VF);
6611 
  // If we have a predicated load or store, it may not be executed for each
  // vector lane. Scale the cost by the probability of executing the
  // predicated block.
6615   if (isPredicatedInst(I)) {
6616     Cost /= getReciprocalPredBlockProb();
6617 
6618     if (useEmulatedMaskMemRefHack(I))
6619       // Artificially setting to a high enough value to practically disable
6620       // vectorization with such operations.
6621       Cost = 3000000;
6622   }
6623 
6624   return Cost;
6625 }
6626 
6627 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6628                                                              ElementCount VF) {
6629   Type *ValTy = getMemInstValueType(I);
6630   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6631   Value *Ptr = getLoadStorePointerOperand(I);
6632   unsigned AS = getLoadStoreAddressSpace(I);
6633   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6634   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6635 
6636   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6637          "Stride should be 1 or -1 for consecutive memory access");
6638   const Align Alignment = getLoadStoreAlignment(I);
6639   unsigned Cost = 0;
6640   if (Legal->isMaskRequired(I))
6641     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6642                                       CostKind);
6643   else
6644     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6645                                 CostKind, I);
6646 
6647   bool Reverse = ConsecutiveStride < 0;
6648   if (Reverse)
6649     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6650   return Cost;
6651 }
6652 
6653 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6654                                                          ElementCount VF) {
6655   assert(Legal->isUniformMemOp(*I));
6656 
6657   Type *ValTy = getMemInstValueType(I);
6658   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6659   const Align Alignment = getLoadStoreAlignment(I);
6660   unsigned AS = getLoadStoreAddressSpace(I);
6661   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6662   if (isa<LoadInst>(I)) {
6663     return TTI.getAddressComputationCost(ValTy) +
6664            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6665                                CostKind) +
6666            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6667   }
6668   StoreInst *SI = cast<StoreInst>(I);
6669 
6670   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6671   return TTI.getAddressComputationCost(ValTy) +
6672          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6673                              CostKind) +
6674          (isLoopInvariantStoreValue
6675               ? 0
6676               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6677                                        VF.getKnownMinValue() - 1));
6678 }
6679 
6680 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6681                                                           ElementCount VF) {
6682   Type *ValTy = getMemInstValueType(I);
6683   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6684   const Align Alignment = getLoadStoreAlignment(I);
6685   const Value *Ptr = getLoadStorePointerOperand(I);
6686 
6687   return TTI.getAddressComputationCost(VectorTy) +
6688          TTI.getGatherScatterOpCost(
6689              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6690              TargetTransformInfo::TCK_RecipThroughput, I);
6691 }
6692 
6693 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6694                                                             ElementCount VF) {
6695   Type *ValTy = getMemInstValueType(I);
6696   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6697   unsigned AS = getLoadStoreAddressSpace(I);
6698 
6699   auto Group = getInterleavedAccessGroup(I);
6700   assert(Group && "Fail to get an interleaved access group.");
6701 
6702   unsigned InterleaveFactor = Group->getFactor();
6703   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6704   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6705 
6706   // Holds the indices of existing members in an interleaved load group.
6707   // An interleaved store group doesn't need this as it doesn't allow gaps.
6708   SmallVector<unsigned, 4> Indices;
6709   if (isa<LoadInst>(I)) {
6710     for (unsigned i = 0; i < InterleaveFactor; i++)
6711       if (Group->getMember(i))
6712         Indices.push_back(i);
6713   }
6714 
6715   // Calculate the cost of the whole interleaved group.
6716   bool UseMaskForGaps =
6717       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6718   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6719       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6720       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6721 
6722   if (Group->isReverse()) {
6723     // TODO: Add support for reversed masked interleaved access.
6724     assert(!Legal->isMaskRequired(I) &&
6725            "Reverse masked interleaved access not supported.");
6726     Cost += Group->getNumMembers() *
6727             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6728   }
6729   return Cost;
6730 }
6731 
6732 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6733                                                               ElementCount VF) {
  // Calculate the scalar cost only. The vectorization cost should already be
  // available at this point.
6736   if (VF.isScalar()) {
6737     Type *ValTy = getMemInstValueType(I);
6738     const Align Alignment = getLoadStoreAlignment(I);
6739     unsigned AS = getLoadStoreAddressSpace(I);
6740 
6741     return TTI.getAddressComputationCost(ValTy) +
6742            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6743                                TTI::TCK_RecipThroughput, I);
6744   }
6745   return getWideningCost(I, VF);
6746 }
6747 
6748 LoopVectorizationCostModel::VectorizationCostTy
6749 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6750                                                ElementCount VF) {
6751   // If we know that this instruction will remain uniform, check the cost of
6752   // the scalar version.
6753   if (isUniformAfterVectorization(I, VF))
6754     VF = ElementCount::getFixed(1);
6755 
6756   if (VF.isVector() && isProfitableToScalarize(I, VF))
6757     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6758 
6759   // Forced scalars do not have any scalarization overhead.
6760   auto ForcedScalar = ForcedScalars.find(VF);
6761   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6762     auto InstSet = ForcedScalar->second;
6763     if (InstSet.count(I))
6764       return VectorizationCostTy(
6765           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6766            VF.getKnownMinValue()),
6767           false);
6768   }
6769 
6770   Type *VectorTy;
6771   unsigned C = getInstructionCost(I, VF, VectorTy);
6772 
6773   bool TypeNotScalarized =
6774       VF.isVector() && VectorTy->isVectorTy() &&
6775       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6776   return VectorizationCostTy(C, TypeNotScalarized);
6777 }
6778 
6779 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6780                                                               ElementCount VF) {
6781 
6782   assert(!VF.isScalable() &&
6783          "cannot compute scalarization overhead for scalable vectorization");
6784   if (VF.isScalar())
6785     return 0;
6786 
6787   unsigned Cost = 0;
6788   Type *RetTy = ToVectorTy(I->getType(), VF);
6789   if (!RetTy->isVoidTy() &&
6790       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6791     Cost += TTI.getScalarizationOverhead(
6792         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6793         true, false);
6794 
6795   // Some targets keep addresses scalar.
6796   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6797     return Cost;
6798 
6799   // Some targets support efficient element stores.
6800   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6801     return Cost;
6802 
6803   // Collect operands to consider.
6804   CallInst *CI = dyn_cast<CallInst>(I);
6805   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6806 
6807   // Skip operands that do not require extraction/scalarization and do not incur
6808   // any overhead.
6809   return Cost + TTI.getOperandsScalarizationOverhead(
6810                     filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6811 }
6812 
6813 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6814   if (VF.isScalar())
6815     return;
6816   NumPredStores = 0;
6817   for (BasicBlock *BB : TheLoop->blocks()) {
6818     // For each instruction in the old loop.
6819     for (Instruction &I : *BB) {
6820       Value *Ptr =  getLoadStorePointerOperand(&I);
6821       if (!Ptr)
6822         continue;
6823 
6824       // TODO: We should generate better code and update the cost model for
6825       // predicated uniform stores. Today they are treated as any other
6826       // predicated store (see added test cases in
6827       // invariant-store-vectorization.ll).
6828       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6829         NumPredStores++;
6830 
6831       if (Legal->isUniformMemOp(I)) {
6832         // TODO: Avoid replicating loads and stores instead of
6833         // relying on instcombine to remove them.
6834         // Load: Scalar load + broadcast
6835         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6836         unsigned Cost = getUniformMemOpCost(&I, VF);
6837         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6838         continue;
6839       }
6840 
6841       // We assume that widening is the best solution when possible.
6842       if (memoryInstructionCanBeWidened(&I, VF)) {
6843         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6844         int ConsecutiveStride =
6845                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6846         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6847                "Expected consecutive stride.");
6848         InstWidening Decision =
6849             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6850         setWideningDecision(&I, VF, Decision, Cost);
6851         continue;
6852       }
6853 
6854       // Choose between Interleaving, Gather/Scatter or Scalarization.
6855       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6856       unsigned NumAccesses = 1;
6857       if (isAccessInterleaved(&I)) {
6858         auto Group = getInterleavedAccessGroup(&I);
6859         assert(Group && "Fail to get an interleaved access group.");
6860 
6861         // Make one decision for the whole group.
6862         if (getWideningDecision(&I, VF) != CM_Unknown)
6863           continue;
6864 
6865         NumAccesses = Group->getNumMembers();
6866         if (interleavedAccessCanBeWidened(&I, VF))
6867           InterleaveCost = getInterleaveGroupCost(&I, VF);
6868       }
6869 
6870       unsigned GatherScatterCost =
6871           isLegalGatherOrScatter(&I)
6872               ? getGatherScatterCost(&I, VF) * NumAccesses
6873               : std::numeric_limits<unsigned>::max();
6874 
6875       unsigned ScalarizationCost =
6876           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6877 
6878       // Choose better solution for the current VF,
6879       // write down this decision and use it during vectorization.
6880       unsigned Cost;
6881       InstWidening Decision;
6882       if (InterleaveCost <= GatherScatterCost &&
6883           InterleaveCost < ScalarizationCost) {
6884         Decision = CM_Interleave;
6885         Cost = InterleaveCost;
6886       } else if (GatherScatterCost < ScalarizationCost) {
6887         Decision = CM_GatherScatter;
6888         Cost = GatherScatterCost;
6889       } else {
6890         Decision = CM_Scalarize;
6891         Cost = ScalarizationCost;
6892       }
      // If the instruction belongs to an interleave group, the whole group
6894       // receives the same decision. The whole group receives the cost, but
6895       // the cost will actually be assigned to one instruction.
6896       if (auto Group = getInterleavedAccessGroup(&I))
6897         setWideningDecision(Group, VF, Decision, Cost);
6898       else
6899         setWideningDecision(&I, VF, Decision, Cost);
6900     }
6901   }
6902 
6903   // Make sure that any load of address and any other address computation
6904   // remains scalar unless there is gather/scatter support. This avoids
6905   // inevitable extracts into address registers, and also has the benefit of
6906   // activating LSR more, since that pass can't optimize vectorized
6907   // addresses.
6908   if (TTI.prefersVectorizedAddressing())
6909     return;
6910 
6911   // Start with all scalar pointer uses.
6912   SmallPtrSet<Instruction *, 8> AddrDefs;
6913   for (BasicBlock *BB : TheLoop->blocks())
6914     for (Instruction &I : *BB) {
6915       Instruction *PtrDef =
6916         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6917       if (PtrDef && TheLoop->contains(PtrDef) &&
6918           getWideningDecision(&I, VF) != CM_GatherScatter)
6919         AddrDefs.insert(PtrDef);
6920     }
6921 
6922   // Add all instructions used to generate the addresses.
6923   SmallVector<Instruction *, 4> Worklist;
6924   for (auto *I : AddrDefs)
6925     Worklist.push_back(I);
6926   while (!Worklist.empty()) {
6927     Instruction *I = Worklist.pop_back_val();
6928     for (auto &Op : I->operands())
6929       if (auto *InstOp = dyn_cast<Instruction>(Op))
6930         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6931             AddrDefs.insert(InstOp).second)
6932           Worklist.push_back(InstOp);
6933   }
6934 
6935   for (auto *I : AddrDefs) {
6936     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
6941       InstWidening Decision = getWideningDecision(I, VF);
6942       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6943         // Scalarize a widened load of address.
6944         setWideningDecision(
6945             I, VF, CM_Scalarize,
6946             (VF.getKnownMinValue() *
6947              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6948       else if (auto Group = getInterleavedAccessGroup(I)) {
6949         // Scalarize an interleave group of address loads.
6950         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6951           if (Instruction *Member = Group->getMember(I))
6952             setWideningDecision(
6953                 Member, VF, CM_Scalarize,
6954                 (VF.getKnownMinValue() *
6955                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6956         }
6957       }
6958     } else
6959       // Make sure I gets scalarized and a cost estimate without
6960       // scalarization overhead.
6961       ForcedScalars[VF].insert(I);
6962   }
6963 }
6964 
6965 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6966                                                         ElementCount VF,
6967                                                         Type *&VectorTy) {
6968   Type *RetTy = I->getType();
6969   if (canTruncateToMinimalBitwidth(I, VF))
6970     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6971   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6972   auto SE = PSE.getSE();
6973   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6974 
6975   // TODO: We need to estimate the cost of intrinsic calls.
6976   switch (I->getOpcode()) {
6977   case Instruction::GetElementPtr:
6978     // We mark this instruction as zero-cost because the cost of GEPs in
6979     // vectorized code depends on whether the corresponding memory instruction
6980     // is scalarized or not. Therefore, we handle GEPs with the memory
6981     // instruction cost.
6982     return 0;
6983   case Instruction::Br: {
6984     // In cases of scalarized and predicated instructions, there will be VF
6985     // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6987     bool ScalarPredicatedBB = false;
6988     BranchInst *BI = cast<BranchInst>(I);
6989     if (VF.isVector() && BI->isConditional() &&
6990         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6991          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6992       ScalarPredicatedBB = true;
6993 
6994     if (ScalarPredicatedBB) {
6995       // Return cost for branches around scalarized and predicated blocks.
6996       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6997       auto *Vec_i1Ty =
6998           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6999       return (TTI.getScalarizationOverhead(
7000                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
7001                   false, true) +
7002               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
7003                VF.getKnownMinValue()));
7004     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7005       // The back-edge branch will remain, as will all scalar branches.
7006       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7007     else
7008       // This branch will be eliminated by if-conversion.
7009       return 0;
7010     // Note: We currently assume zero cost for an unconditional branch inside
7011     // a predicated block since it will become a fall-through, although we
7012     // may decide in the future to call TTI for all branches.
7013   }
7014   case Instruction::PHI: {
7015     auto *Phi = cast<PHINode>(I);
7016 
7017     // First-order recurrences are replaced by vector shuffles inside the loop.
7018     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7019     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7020       return TTI.getShuffleCost(
7021           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7022           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7023 
7024     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7025     // converted into select instructions. We require N - 1 selects per phi
7026     // node, where N is the number of incoming values.
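    // For illustration only: a phi with three incoming values lowers to two
    // selects, so its cost is twice the vector select cost computed below.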
7027     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7028       return (Phi->getNumIncomingValues() - 1) *
7029              TTI.getCmpSelInstrCost(
7030                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7031                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7032                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7033 
7034     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7035   }
7036   case Instruction::UDiv:
7037   case Instruction::SDiv:
7038   case Instruction::URem:
7039   case Instruction::SRem:
7040     // If we have a predicated instruction, it may not be executed for each
7041     // vector lane. Get the scalarization cost and scale this amount by the
7042     // probability of executing the predicated block. If the instruction is not
7043     // predicated, we fall through to the next case.
7044     if (VF.isVector() && isScalarWithPredication(I)) {
7045       unsigned Cost = 0;
7046 
7047       // These instructions have a non-void type, so account for the phi nodes
7048       // that we will create. This cost is likely to be zero. The phi node
7049       // cost, if any, should be scaled by the block probability because it
7050       // models a copy at the end of each predicated block.
7051       Cost += VF.getKnownMinValue() *
7052               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7053 
7054       // The cost of the non-predicated instruction.
7055       Cost += VF.getKnownMinValue() *
7056               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7057 
7058       // The cost of insertelement and extractelement instructions needed for
7059       // scalarization.
7060       Cost += getScalarizationOverhead(I, VF);
7061 
7062       // Scale the cost by the probability of executing the predicated blocks.
7063       // This assumes the predicated block for each vector lane is equally
7064       // likely.
7065       return Cost / getReciprocalPredBlockProb();
7066     }
7067     LLVM_FALLTHROUGH;
7068   case Instruction::Add:
7069   case Instruction::FAdd:
7070   case Instruction::Sub:
7071   case Instruction::FSub:
7072   case Instruction::Mul:
7073   case Instruction::FMul:
7074   case Instruction::FDiv:
7075   case Instruction::FRem:
7076   case Instruction::Shl:
7077   case Instruction::LShr:
7078   case Instruction::AShr:
7079   case Instruction::And:
7080   case Instruction::Or:
7081   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go away.
7083     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7084       return 0;
7085     // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
7087     Value *Op2 = I->getOperand(1);
7088     TargetTransformInfo::OperandValueProperties Op2VP;
7089     TargetTransformInfo::OperandValueKind Op2VK =
7090         TTI.getOperandInfo(Op2, Op2VP);
7091     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7092       Op2VK = TargetTransformInfo::OK_UniformValue;
7093 
7094     SmallVector<const Value *, 4> Operands(I->operand_values());
7095     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7096     return N * TTI.getArithmeticInstrCost(
7097                    I->getOpcode(), VectorTy, CostKind,
7098                    TargetTransformInfo::OK_AnyValue,
7099                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7100   }
7101   case Instruction::FNeg: {
7102     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7103     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7104     return N * TTI.getArithmeticInstrCost(
7105                    I->getOpcode(), VectorTy, CostKind,
7106                    TargetTransformInfo::OK_AnyValue,
7107                    TargetTransformInfo::OK_AnyValue,
7108                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
7109                    I->getOperand(0), I);
7110   }
7111   case Instruction::Select: {
7112     SelectInst *SI = cast<SelectInst>(I);
7113     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7114     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7115     Type *CondTy = SI->getCondition()->getType();
7116     if (!ScalarCond) {
7117       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7118       CondTy = VectorType::get(CondTy, VF);
7119     }
7120     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7121                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7122   }
7123   case Instruction::ICmp:
7124   case Instruction::FCmp: {
7125     Type *ValTy = I->getOperand(0)->getType();
7126     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7127     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7128       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7129     VectorTy = ToVectorTy(ValTy, VF);
7130     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7131                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7132   }
7133   case Instruction::Store:
7134   case Instruction::Load: {
7135     ElementCount Width = VF;
7136     if (Width.isVector()) {
7137       InstWidening Decision = getWideningDecision(I, Width);
7138       assert(Decision != CM_Unknown &&
7139              "CM decision should be taken at this point");
7140       if (Decision == CM_Scalarize)
7141         Width = ElementCount::getFixed(1);
7142     }
7143     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
7144     return getMemoryInstructionCost(I, VF);
7145   }
7146   case Instruction::ZExt:
7147   case Instruction::SExt:
7148   case Instruction::FPToUI:
7149   case Instruction::FPToSI:
7150   case Instruction::FPExt:
7151   case Instruction::PtrToInt:
7152   case Instruction::IntToPtr:
7153   case Instruction::SIToFP:
7154   case Instruction::UIToFP:
7155   case Instruction::Trunc:
7156   case Instruction::FPTrunc:
7157   case Instruction::BitCast: {
7158     // Computes the CastContextHint from a Load/Store instruction.
7159     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7160       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7161              "Expected a load or a store!");
7162 
7163       if (VF.isScalar() || !TheLoop->contains(I))
7164         return TTI::CastContextHint::Normal;
7165 
7166       switch (getWideningDecision(I, VF)) {
7167       case LoopVectorizationCostModel::CM_GatherScatter:
7168         return TTI::CastContextHint::GatherScatter;
7169       case LoopVectorizationCostModel::CM_Interleave:
7170         return TTI::CastContextHint::Interleave;
7171       case LoopVectorizationCostModel::CM_Scalarize:
7172       case LoopVectorizationCostModel::CM_Widen:
7173         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7174                                         : TTI::CastContextHint::Normal;
7175       case LoopVectorizationCostModel::CM_Widen_Reverse:
7176         return TTI::CastContextHint::Reversed;
7177       case LoopVectorizationCostModel::CM_Unknown:
7178         llvm_unreachable("Instr did not go through cost modelling?");
7179       }
7180 
7181       llvm_unreachable("Unhandled case!");
7182     };
7183 
7184     unsigned Opcode = I->getOpcode();
7185     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7186     // For Trunc, the context is the only user, which must be a StoreInst.
7187     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7188       if (I->hasOneUse())
7189         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7190           CCH = ComputeCCH(Store);
7191     }
7192     // For Z/Sext, the context is the operand, which must be a LoadInst.
7193     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7194              Opcode == Instruction::FPExt) {
7195       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7196         CCH = ComputeCCH(Load);
7197     }
7198 
7199     // We optimize the truncation of induction variables having constant
7200     // integer steps. The cost of these truncations is the same as the scalar
7201     // operation.
7202     if (isOptimizableIVTruncate(I, VF)) {
7203       auto *Trunc = cast<TruncInst>(I);
7204       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7205                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7206     }
7207 
7208     Type *SrcScalarTy = I->getOperand(0)->getType();
7209     Type *SrcVecTy =
7210         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7211     if (canTruncateToMinimalBitwidth(I, VF)) {
7212       // This cast is going to be shrunk. This may remove the cast or it might
7213       // turn it into a slightly different cast. For example, if MinBW == 16,
7214       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7215       //
7216       // Calculate the modified src and dest types.
7217       Type *MinVecTy = VectorTy;
7218       if (Opcode == Instruction::Trunc) {
7219         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7220         VectorTy =
7221             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7222       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7223         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7224         VectorTy =
7225             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7226       }
7227     }
7228 
7229     assert(!VF.isScalable() && "VF is assumed to be non scalable");
7230     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7231     return N *
7232            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7233   }
7234   case Instruction::Call: {
7235     bool NeedToScalarize;
7236     CallInst *CI = cast<CallInst>(I);
7237     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7238     if (getVectorIntrinsicIDForCall(CI, TLI))
7239       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
7240     return CallCost;
7241   }
7242   case Instruction::ExtractValue: {
7243     InstructionCost ExtractCost =
7244         TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7245     assert(ExtractCost.isValid() && "Invalid cost for ExtractValue");
7246     return *(ExtractCost.getValue());
7247   }
7248   default:
7249     // The cost of executing VF copies of the scalar instruction. This opcode
7250     // is unknown. Assume that it is the same as 'mul'.
7251     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
7252                                        Instruction::Mul, VectorTy, CostKind) +
7253            getScalarizationOverhead(I, VF);
7254   } // end of switch.
7255 }
7256 
7257 char LoopVectorize::ID = 0;
7258 
7259 static const char lv_name[] = "Loop Vectorization";
7260 
7261 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7262 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7263 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7264 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7265 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7266 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7267 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7268 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7269 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7270 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7271 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7272 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7273 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7274 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7275 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7276 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7277 
7278 namespace llvm {
7279 
7280 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7281 
7282 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7283                               bool VectorizeOnlyWhenForced) {
7284   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7285 }
7286 
7287 } // end namespace llvm
7288 
7289 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7290   // Check if the pointer operand of a load or store instruction is
7291   // consecutive.
7292   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7293     return Legal->isConsecutivePtr(Ptr);
7294   return false;
7295 }
7296 
7297 void LoopVectorizationCostModel::collectValuesToIgnore() {
7298   // Ignore ephemeral values.
7299   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7300 
7301   // Ignore type-promoting instructions we identified during reduction
7302   // detection.
7303   for (auto &Reduction : Legal->getReductionVars()) {
7304     RecurrenceDescriptor &RedDes = Reduction.second;
7305     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7306     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7307   }
7308   // Ignore type-casting instructions we identified during induction
7309   // detection.
7310   for (auto &Induction : Legal->getInductionVars()) {
7311     InductionDescriptor &IndDes = Induction.second;
7312     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7313     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7314   }
7315 }
7316 
7317 void LoopVectorizationCostModel::collectInLoopReductions() {
7318   for (auto &Reduction : Legal->getReductionVars()) {
7319     PHINode *Phi = Reduction.first;
7320     RecurrenceDescriptor &RdxDesc = Reduction.second;
7321 
7322     // We don't collect reductions that are type promoted (yet).
7323     if (RdxDesc.getRecurrenceType() != Phi->getType())
7324       continue;
7325 
7326     // If the target would prefer this reduction to happen "in-loop", then we
7327     // want to record it as such.
7328     unsigned Opcode = RdxDesc.getRecurrenceBinOp();
7329     if (!PreferInLoopReductions &&
7330         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7331                                    TargetTransformInfo::ReductionFlags()))
7332       continue;
7333 
7334     // Check that we can correctly put the reductions into the loop, by
7335     // finding the chain of operations that leads from the phi to the loop
7336     // exit value.
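    // For example, for a simple integer add reduction the chain is typically
    // just the 'add' between the header phi and the loop-exit value; if no
    // such chain can be found, the reduction remains out-of-loop.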
7337     SmallVector<Instruction *, 4> ReductionOperations =
7338         RdxDesc.getReductionOpChain(Phi, TheLoop);
7339     bool InLoop = !ReductionOperations.empty();
7340     if (InLoop)
7341       InLoopReductionChains[Phi] = ReductionOperations;
7342     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7343                       << " reduction for phi: " << *Phi << "\n");
7344   }
7345 }
7346 
7347 // TODO: we could return a pair of values that specify the max VF and
7348 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7349 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7350 // doesn't have a cost model that can choose which plan to execute if
7351 // more than one is generated.
7352 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7353                                  LoopVectorizationCostModel &CM) {
7354   unsigned WidestType;
7355   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
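  // For example (illustrative numbers): 256-bit wide vector registers and a
  // widest scalar type of 32 bits give a VPlan VF of 256 / 32 = 8.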
7356   return WidestVectorRegBits / WidestType;
7357 }
7358 
7359 VectorizationFactor
7360 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7361   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7362   ElementCount VF = UserVF;
7363   // Outer loop handling: outer loops may require CFG and instruction-level
7364   // transformations before even evaluating whether vectorization is profitable.
7365   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7366   // the vectorization pipeline.
7367   if (!OrigLoop->isInnermost()) {
7368     // If the user doesn't provide a vectorization factor, determine a
7369     // reasonable one.
7370     if (UserVF.isZero()) {
7371       VF = ElementCount::getFixed(
7372           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7373       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7374 
7375       // Make sure we have a VF > 1 for stress testing.
7376       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7377         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7378                           << "overriding computed VF.\n");
7379         VF = ElementCount::getFixed(4);
7380       }
7381     }
7382     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7383     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7384            "VF needs to be a power of two");
7385     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7386                       << "VF " << VF << " to build VPlans.\n");
7387     buildVPlans(VF, VF);
7388 
7389     // For VPlan build stress testing, we bail out after VPlan construction.
7390     if (VPlanBuildStressTest)
7391       return VectorizationFactor::Disabled();
7392 
7393     return {VF, 0 /*Cost*/};
7394   }
7395 
7396   LLVM_DEBUG(
7397       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7398                 "VPlan-native path.\n");
7399   return VectorizationFactor::Disabled();
7400 }
7401 
7402 Optional<VectorizationFactor>
7403 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7404   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7405   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
7406   if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
7407     return None;
7408 
7409   // Invalidate interleave groups if all blocks of the loop will be predicated.
7410   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
7411       !useMaskedInterleavedAccesses(*TTI)) {
7412     LLVM_DEBUG(
7413         dbgs()
7414         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7415            "which requires masked-interleaved support.\n");
7416     if (CM.InterleaveInfo.invalidateGroups())
7417       // Invalidating interleave groups also requires invalidating all decisions
7418       // based on them, which includes widening decisions and uniform and scalar
7419       // values.
7420       CM.invalidateCostModelingDecisions();
7421   }
7422 
7423   ElementCount MaxVF = MaybeMaxVF.getValue();
7424   assert(MaxVF.isNonZero() && "MaxVF is zero.");
7425 
7426   if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) {
7427     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7428     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7429            "VF needs to be a power of two");
7430     // Collect the instructions (and their associated costs) that will be more
7431     // profitable to scalarize.
7432     CM.selectUserVectorizationFactor(UserVF);
7433     CM.collectInLoopReductions();
7434     buildVPlansWithVPRecipes(UserVF, UserVF);
7435     LLVM_DEBUG(printPlans(dbgs()));
7436     return {{UserVF, 0}};
7437   }
7438 
7439   assert(!MaxVF.isScalable() &&
7440          "Scalable vectors not yet supported beyond this point");
7441 
7442   for (ElementCount VF = ElementCount::getFixed(1);
7443        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
7444     // Collect Uniform and Scalar instructions after vectorization with VF.
7445     CM.collectUniformsAndScalars(VF);
7446 
7447     // Collect the instructions (and their associated costs) that will be more
7448     // profitable to scalarize.
7449     if (VF.isVector())
7450       CM.collectInstsToScalarize(VF);
7451   }
7452 
7453   CM.collectInLoopReductions();
7454 
7455   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
7456   LLVM_DEBUG(printPlans(dbgs()));
7457   if (MaxVF.isScalar())
7458     return VectorizationFactor::Disabled();
7459 
7460   // Select the optimal vectorization factor.
7461   return CM.selectVectorizationFactor(MaxVF);
7462 }
7463 
7464 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7465   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7466                     << '\n');
7467   BestVF = VF;
7468   BestUF = UF;
7469 
7470   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7471     return !Plan->hasVF(VF);
7472   });
7473   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7474 }
7475 
7476 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7477                                            DominatorTree *DT) {
7478   // Perform the actual loop transformation.
7479 
7480   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7481   VPCallbackILV CallbackILV(ILV);
7482 
7483   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7484 
7485   VPTransformState State{*BestVF, BestUF,      LI,
7486                          DT,      ILV.Builder, ILV.VectorLoopValueMap,
7487                          &ILV,    CallbackILV};
7488   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7489   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7490   State.CanonicalIV = ILV.Induction;
7491 
7492   ILV.printDebugTracesAtStart();
7493 
7494   //===------------------------------------------------===//
7495   //
7496   // Notice: any optimization or new instruction that goes
7497   // into the code below should also be implemented in
7498   // the cost-model.
7499   //
7500   //===------------------------------------------------===//
7501 
7502   // 2. Copy and widen instructions from the old loop into the new loop.
7503   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7504   VPlans.front()->execute(&State);
7505 
7506   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7507   //    predication, updating analyses.
7508   ILV.fixVectorizedLoop();
7509 
7510   ILV.printDebugTracesAtEnd();
7511 }
7512 
7513 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7514     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7515 
7516   // We create new control-flow for the vectorized loop, so the original exit
7517   // conditions will be dead after vectorization if they are only used by the
7518   // terminator.
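  // For example, the 'icmp' feeding an exiting block's conditional branch is
  // typically dead once the vectorized loop gets its own latch condition.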
7519   SmallVector<BasicBlock*> ExitingBlocks;
7520   OrigLoop->getExitingBlocks(ExitingBlocks);
7521   for (auto *BB : ExitingBlocks) {
7522     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7523     if (!Cmp || !Cmp->hasOneUse())
7524       continue;
7525 
7526     // TODO: we should introduce a getUniqueExitingBlocks on Loop
7527     if (!DeadInstructions.insert(Cmp).second)
7528       continue;
7529 
7530     // An operand of the icmp is often a dead trunc, used by IndUpdate.
7531     // TODO: can recurse through operands in general
7532     for (Value *Op : Cmp->operands()) {
7533       if (isa<TruncInst>(Op) && Op->hasOneUse())
7534           DeadInstructions.insert(cast<Instruction>(Op));
7535     }
7536   }
7537 
7538   // We create new "steps" for induction variable updates to which the original
7539   // induction variables map. An original update instruction will be dead if
7540   // all its users except the induction variable are dead.
7541   auto *Latch = OrigLoop->getLoopLatch();
7542   for (auto &Induction : Legal->getInductionVars()) {
7543     PHINode *Ind = Induction.first;
7544     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7545 
7546     // If the tail is to be folded by masking, the primary induction variable,
7547     // if it exists, isn't dead: it will be used for masking. Don't kill it.
7548     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7549       continue;
7550 
7551     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7552           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7553         }))
7554       DeadInstructions.insert(IndUpdate);
7555 
7556     // We record as "Dead" also the type-casting instructions we had identified
7557     // during induction analysis. We don't need any handling for them in the
7558     // vectorized loop because we have proven that, under a proper runtime
7559     // test guarding the vectorized loop, the value of the phi, and the casted
7560     // value of the phi, are the same. The last instruction in this casting chain
7561     // will get its scalar/vector/widened def from the scalar/vector/widened def
7562     // of the respective phi node. Any other casts in the induction def-use chain
7563     // have no other uses outside the phi update chain, and will be ignored.
7564     InductionDescriptor &IndDes = Induction.second;
7565     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7566     DeadInstructions.insert(Casts.begin(), Casts.end());
7567   }
7568 }
7569 
7570 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7571 
7572 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7573 
7574 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7575                                         Instruction::BinaryOps BinOp) {
7576   // When unrolling and the VF is 1, we only need to add a simple scalar.
7577   Type *Ty = Val->getType();
7578   assert(!Ty->isVectorTy() && "Val must be a scalar");
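  // For example, for an integer induction with StartIdx = 2, this returns
  // Val + 2 * Step, i.e. the scalar start value of the third unrolled part.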
7579 
7580   if (Ty->isFloatingPointTy()) {
7581     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7582 
7583     // Floating point operations had to be 'fast' to enable the unrolling.
7584     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7585     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7586   }
7587   Constant *C = ConstantInt::get(Ty, StartIdx);
7588   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7589 }
7590 
7591 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7592   SmallVector<Metadata *, 4> MDs;
7593   // Reserve first location for self reference to the LoopID metadata node.
7594   MDs.push_back(nullptr);
7595   bool IsUnrollMetadata = false;
7596   MDNode *LoopID = L->getLoopID();
7597   if (LoopID) {
7598     // First find existing loop unrolling disable metadata.
7599     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7600       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7601       if (MD) {
7602         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7603         IsUnrollMetadata =
7604             S && S->getString().startswith("llvm.loop.unroll.disable");
7605       }
7606       MDs.push_back(LoopID->getOperand(i));
7607     }
7608   }
7609 
7610   if (!IsUnrollMetadata) {
7611     // Add runtime unroll disable metadata.
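    // The resulting LoopID metadata has roughly this shape (illustrative):
    //   !0 = distinct !{!0, <existing operands...>, !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}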
7612     LLVMContext &Context = L->getHeader()->getContext();
7613     SmallVector<Metadata *, 1> DisableOperands;
7614     DisableOperands.push_back(
7615         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7616     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7617     MDs.push_back(DisableNode);
7618     MDNode *NewLoopID = MDNode::get(Context, MDs);
7619     // Set operand 0 to refer to the loop id itself.
7620     NewLoopID->replaceOperandWith(0, NewLoopID);
7621     L->setLoopID(NewLoopID);
7622   }
7623 }
7624 
7625 //===--------------------------------------------------------------------===//
7626 // EpilogueVectorizerMainLoop
7627 //===--------------------------------------------------------------------===//
7628 
7629 /// This function is partially responsible for generating the control flow
7630 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7631 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7632   MDNode *OrigLoopID = OrigLoop->getLoopID();
7633   Loop *Lp = createVectorLoopSkeleton("");
7634 
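  // In outline, this first pass emits, in order: the epilogue's minimum
  // iteration count check ("iter.check"), the SCEV and memory runtime checks,
  // and the main loop's minimum iteration count check
  // ("vector.main.loop.iter.check"), before creating the main vector loop's
  // induction variable.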
7635   // Generate the code to check the minimum iteration count of the vector
7636   // epilogue (see below).
7637   EPI.EpilogueIterationCountCheck =
7638       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
7639   EPI.EpilogueIterationCountCheck->setName("iter.check");
7640 
7641   // Generate the code to check any assumptions that we've made for SCEV
7642   // expressions.
7643   BasicBlock *SavedPreHeader = LoopVectorPreHeader;
7644   emitSCEVChecks(Lp, LoopScalarPreHeader);
7645 
7646   // If a safety check was generated, save it.
7647   if (SavedPreHeader != LoopVectorPreHeader)
7648     EPI.SCEVSafetyCheck = SavedPreHeader;
7649 
7650   // Generate the code that checks at runtime if arrays overlap. We put the
7651   // checks into a separate block to make the more common case of few elements
7652   // faster.
7653   SavedPreHeader = LoopVectorPreHeader;
7654   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
7655 
7656   // If a safety check was generated, save/overwrite it.
7657   if (SavedPreHeader != LoopVectorPreHeader)
7658     EPI.MemSafetyCheck = SavedPreHeader;
7659 
7660   // Generate the iteration count check for the main loop, *after* the check
7661   // for the epilogue loop, so that the path-length is shorter for the case
7662   // that goes directly through the vector epilogue. The longer path length for
7663   // the main loop is compensated for by the gain from vectorizing the larger
7664   // trip count. Note: the branch will get updated later on when we vectorize
7665   // the epilogue.
7666   EPI.MainLoopIterationCountCheck =
7667       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
7668 
7669   // Generate the induction variable.
7670   OldInduction = Legal->getPrimaryInduction();
7671   Type *IdxTy = Legal->getWidestInductionType();
7672   Value *StartIdx = ConstantInt::get(IdxTy, 0);
7673   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7674   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7675   EPI.VectorTripCount = CountRoundDown;
7676   Induction =
7677       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7678                               getDebugLocFromInstOrOperands(OldInduction));
7679 
7680   // Skip induction resume value creation here because the resume values will
7681   // be created in the second pass. If we created them here, they wouldn't be
7682   // used anyway, because the VPlan in the second pass still contains the
7683   // inductions from the original loop.
7684 
7685   return completeLoopSkeleton(Lp, OrigLoopID);
7686 }
7687 
7688 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7689   LLVM_DEBUG({
7690     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7691            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7692            << ", Main Loop UF:" << EPI.MainLoopUF
7693            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7694            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7695   });
7696 }
7697 
7698 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7699   DEBUG_WITH_TYPE(VerboseDebug, {
7700     dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
7701   });
7702 }
7703 
7704 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
7705     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
7706   assert(L && "Expected valid Loop.");
7707   assert(Bypass && "Expected valid bypass basic block.");
7708   unsigned VFactor =
7709       ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
7710   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7711   Value *Count = getOrCreateTripCount(L);
7712   // Reuse existing vector loop preheader for TC checks.
7713   // Note that a new preheader block is generated for the vector loop.
7714   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7715   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7716 
7717   // Generate code to check if the loop's trip count is less than VF * UF of the
7718   // main vector loop.
7719   auto P =
7720       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7721 
7722   Value *CheckMinIters = Builder.CreateICmp(
7723       P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
7724       "min.iters.check");
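  // For example, with VFactor = 8 and UFactor = 2, the branch to Bypass is
  // taken when the trip count is below 16 (or at most 16 when a scalar
  // epilogue is required).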
7725 
7726   if (!ForEpilogue)
7727     TCCheckBlock->setName("vector.main.loop.iter.check");
7728 
7729   // Create new preheader for vector loop.
7730   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7731                                    DT, LI, nullptr, "vector.ph");
7732 
7733   if (ForEpilogue) {
7734     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7735                                  DT->getNode(Bypass)->getIDom()) &&
7736            "TC check is expected to dominate Bypass");
7737 
7738     // Update dominator for Bypass & LoopExit.
7739     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7740     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7741 
7742     LoopBypassBlocks.push_back(TCCheckBlock);
7743 
7744     // Save the trip count so we don't have to regenerate it in the
7745     // vec.epilog.iter.check. This is safe to do because the trip count
7746     // generated here dominates the vector epilog iter check.
7747     EPI.TripCount = Count;
7748   }
7749 
7750   ReplaceInstWithInst(
7751       TCCheckBlock->getTerminator(),
7752       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7753 
7754   return TCCheckBlock;
7755 }
7756 
7757 //===--------------------------------------------------------------------===//
7758 // EpilogueVectorizerEpilogueLoop
7759 //===--------------------------------------------------------------------===//
7760 
7761 /// This function is partially responsible for generating the control flow
7762 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7763 BasicBlock *
7764 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7765   MDNode *OrigLoopID = OrigLoop->getLoopID();
7766   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
7767 
7768   // Now, compare the remaining count and, if there aren't enough iterations to
7769   // execute the vectorized epilogue, skip to the scalar part.
7770   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7771   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7772   LoopVectorPreHeader =
7773       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7774                  LI, nullptr, "vec.epilog.ph");
7775   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
7776                                           VecEpilogueIterationCountCheck);
7777 
7778   // Adjust the control flow taking the state info from the main loop
7779   // vectorization into account.
7780   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7781          "expected this to be saved from the previous pass.");
7782   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7783       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7784 
7785   DT->changeImmediateDominator(LoopVectorPreHeader,
7786                                EPI.MainLoopIterationCountCheck);
7787 
7788   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7789       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7790 
7791   if (EPI.SCEVSafetyCheck)
7792     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7793         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7794   if (EPI.MemSafetyCheck)
7795     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7796         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7797 
7798   DT->changeImmediateDominator(
7799       VecEpilogueIterationCountCheck,
7800       VecEpilogueIterationCountCheck->getSinglePredecessor());
7801 
7802   DT->changeImmediateDominator(LoopScalarPreHeader,
7803                                EPI.EpilogueIterationCountCheck);
7804   DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
7805 
7806   // Keep track of bypass blocks, as they feed start values to the induction
7807   // phis in the scalar loop preheader.
7808   if (EPI.SCEVSafetyCheck)
7809     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7810   if (EPI.MemSafetyCheck)
7811     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7812   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7813 
7814   // Generate a resume induction for the vector epilogue and put it in the
7815   // vector epilogue preheader.
7816   Type *IdxTy = Legal->getWidestInductionType();
7817   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7818                                          LoopVectorPreHeader->getFirstNonPHI());
7819   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7820   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7821                            EPI.MainLoopIterationCountCheck);
7822 
7823   // Generate the induction variable.
7824   OldInduction = Legal->getPrimaryInduction();
7825   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7826   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7827   Value *StartIdx = EPResumeVal;
7828   Induction =
7829       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7830                               getDebugLocFromInstOrOperands(OldInduction));
7831 
7832   // Generate induction resume values. These variables save the new starting
7833   // indexes for the scalar loop. They are used to test if there are any tail
7834   // iterations left once the vector loop has completed.
7835   // Note that when the vectorized epilogue is skipped due to iteration count
7836   // check, then the resume value for the induction variable comes from
7837   // the trip count of the main vector loop, hence passing the AdditionalBypass
7838   // argument.
7839   createInductionResumeValues(Lp, CountRoundDown,
7840                               {VecEpilogueIterationCountCheck,
7841                                EPI.VectorTripCount} /* AdditionalBypass */);
7842 
7843   AddRuntimeUnrollDisableMetaData(Lp);
7844   return completeLoopSkeleton(Lp, OrigLoopID);
7845 }
7846 
7847 BasicBlock *
7848 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7849     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
7850 
7851   assert(EPI.TripCount &&
7852          "Expected trip count to have been safed in the first pass.");
7853   assert(
7854       (!isa<Instruction>(EPI.TripCount) ||
7855        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7856       "saved trip count does not dominate insertion point.");
7857   Value *TC = EPI.TripCount;
7858   IRBuilder<> Builder(Insert->getTerminator());
7859   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7860 
7861   // Generate code to check if the loop's trip count is less than VF * UF of the
7862   // vector epilogue loop.
7863   auto P =
7864       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7865 
7866   Value *CheckMinIters = Builder.CreateICmp(
7867       P, Count,
7868       ConstantInt::get(Count->getType(),
7869                        EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
7870       "min.epilog.iters.check");
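  // For example (illustrative numbers): with an original trip count of 1000
  // and a main vector loop that covered 992 iterations, Count is 8 and the
  // vectorized epilogue is skipped if 8 is below EpilogueVF * EpilogueUF.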
7871 
7872   ReplaceInstWithInst(
7873       Insert->getTerminator(),
7874       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7875 
7876   LoopBypassBlocks.push_back(Insert);
7877   return Insert;
7878 }
7879 
7880 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7881   LLVM_DEBUG({
7882     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7883            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7884            << ", Main Loop UF:" << EPI.MainLoopUF
7885            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7886            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7887   });
7888 }
7889 
7890 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7891   DEBUG_WITH_TYPE(VerboseDebug, {
7892     dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
7893   });
7894 }
7895 
7896 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7897     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7898   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7899   bool PredicateAtRangeStart = Predicate(Range.Start);
7900 
7901   for (ElementCount TmpVF = Range.Start * 2;
7902        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7903     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7904       Range.End = TmpVF;
7905       break;
7906     }
7907 
7908   return PredicateAtRangeStart;
7909 }
7910 
7911 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7912 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7913 /// of VF's starting at a given VF and extending it as much as possible. Each
7914 /// vectorization decision can potentially shorten this sub-range during
7915 /// buildVPlan().
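/// For example, with MinVF = 1 and MaxVF = 8, the first call covers the
/// candidate range [1, 9); if some decision changes starting at VF = 4, that
/// range is clamped to [1, 4) and the next plan is built for [4, 9).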
7916 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7917                                            ElementCount MaxVF) {
7918   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7919   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7920     VFRange SubRange = {VF, MaxVFPlusOne};
7921     VPlans.push_back(buildVPlan(SubRange));
7922     VF = SubRange.End;
7923   }
7924 }
7925 
7926 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7927                                          VPlanPtr &Plan) {
7928   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7929 
7930   // Look for cached value.
7931   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7932   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7933   if (ECEntryIt != EdgeMaskCache.end())
7934     return ECEntryIt->second;
7935 
7936   VPValue *SrcMask = createBlockInMask(Src, Plan);
7937 
7938   // The terminator has to be a branch inst!
7939   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7940   assert(BI && "Unexpected terminator found");
7941 
7942   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7943     return EdgeMaskCache[Edge] = SrcMask;
7944 
7945   // If source is an exiting block, we know the exit edge is dynamically dead
7946   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
7947   // adding uses of an otherwise potentially dead instruction.
7948   if (OrigLoop->isLoopExiting(Src))
7949     return EdgeMaskCache[Edge] = SrcMask;
7950 
7951   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
7952   assert(EdgeMask && "No Edge Mask found for condition");
7953 
7954   if (BI->getSuccessor(0) != Dst)
7955     EdgeMask = Builder.createNot(EdgeMask);
7956 
7957   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7958     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7959 
7960   return EdgeMaskCache[Edge] = EdgeMask;
7961 }
7962 
7963 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7964   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7965 
7966   // Look for cached value.
7967   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7968   if (BCEntryIt != BlockMaskCache.end())
7969     return BCEntryIt->second;
7970 
7971   // All-one mask is modelled as no-mask following the convention for masked
7972   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7973   VPValue *BlockMask = nullptr;
7974 
7975   if (OrigLoop->getHeader() == BB) {
7976     if (!CM.blockNeedsPredication(BB))
7977       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7978 
7979     // Create the block in mask as the first non-phi instruction in the block.
7980     VPBuilder::InsertPointGuard Guard(Builder);
7981     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
7982     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
7983 
7984     // Introduce the early-exit compare IV <= BTC to form header block mask.
7985     // This is used instead of IV < TC because TC may wrap, unlike BTC.
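    // For example, with a trip count of 10 and VF = 4, BTC is 9 and the last
    // vector iteration compares lanes {8, 9, 10, 11} against 9, yielding the
    // mask {1, 1, 0, 0}.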
7986     // Start by constructing the desired canonical IV.
7987     VPValue *IV = nullptr;
7988     if (Legal->getPrimaryInduction())
7989       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
7990     else {
7991       auto IVRecipe = new VPWidenCanonicalIVRecipe();
7992       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
7993       IV = IVRecipe->getVPValue();
7994     }
7995     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7996     bool TailFolded = !CM.isScalarEpilogueAllowed();
7997 
7998     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
7999       // While ActiveLaneMask is a binary op that consumes the loop tripcount
8000       // as a second argument, we only pass the IV here and extract the
8001       // tripcount from the transform state where codegen of the VP instructions
8002       // happens.
8003       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
8004     } else {
8005       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8006     }
8007     return BlockMaskCache[BB] = BlockMask;
8008   }
8009 
8010   // This is the block mask. We OR all incoming edges.
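  // For example, a block with predecessors P1 and P2 gets the mask
  // EdgeMask(P1 -> BB) | EdgeMask(P2 -> BB), unless one of those edge masks
  // is all-one, in which case the block mask is all-one as well.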
8011   for (auto *Predecessor : predecessors(BB)) {
8012     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8013     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8014       return BlockMaskCache[BB] = EdgeMask;
8015 
8016     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8017       BlockMask = EdgeMask;
8018       continue;
8019     }
8020 
8021     BlockMask = Builder.createOr(BlockMask, EdgeMask);
8022   }
8023 
8024   return BlockMaskCache[BB] = BlockMask;
8025 }
8026 
8027 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
8028                                                 VPlanPtr &Plan) {
8029   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8030          "Must be called with either a load or store");
8031 
8032   auto willWiden = [&](ElementCount VF) -> bool {
8033     if (VF.isScalar())
8034       return false;
8035     LoopVectorizationCostModel::InstWidening Decision =
8036         CM.getWideningDecision(I, VF);
8037     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8038            "CM decision should be taken at this point.");
8039     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8040       return true;
8041     if (CM.isScalarAfterVectorization(I, VF) ||
8042         CM.isProfitableToScalarize(I, VF))
8043       return false;
8044     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8045   };
8046 
8047   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8048     return nullptr;
8049 
8050   VPValue *Mask = nullptr;
8051   if (Legal->isMaskRequired(I))
8052     Mask = createBlockInMask(I->getParent(), Plan);
8053 
8054   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
8055   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8056     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
8057 
8058   StoreInst *Store = cast<StoreInst>(I);
8059   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
8060   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
8061 }
8062 
8063 VPWidenIntOrFpInductionRecipe *
8064 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
8065   // Check if this is an integer or fp induction. If so, build the recipe that
8066   // produces its scalar and vector values.
8067   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8068   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8069       II.getKind() == InductionDescriptor::IK_FpInduction)
8070     return new VPWidenIntOrFpInductionRecipe(Phi);
8071 
8072   return nullptr;
8073 }
8074 
8075 VPWidenIntOrFpInductionRecipe *
8076 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
8077                                                 VFRange &Range) const {
8078   // Optimize the special case where the source is a constant integer
8079   // induction variable. Notice that we can only optimize the 'trunc' case
8080   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8081   // (c) other casts depend on pointer size.
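  // For example, a 'trunc i64 %iv to i32' of such an induction can essentially
  // be widened as a narrower i32 induction, instead of widening the i64
  // induction and truncating every element.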
8082 
8083   // Determine whether \p K is a truncation based on an induction variable that
8084   // can be optimized.
8085   auto isOptimizableIVTruncate =
8086       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8087     return [=](ElementCount VF) -> bool {
8088       return CM.isOptimizableIVTruncate(K, VF);
8089     };
8090   };
8091 
8092   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8093           isOptimizableIVTruncate(I), Range))
8094     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8095                                              I);
8096   return nullptr;
8097 }
8098 
8099 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
8100   // We know that all PHIs in non-header blocks are converted into selects, so
8101   // we don't have to worry about the insertion order and we can just use the
8102   // builder. At this point we generate the predication tree. There may be
8103   // duplications since this is a simple recursive scan, but future
8104   // optimizations will clean it up.
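  // For example, a phi with incoming values %a (from if.then) and %b (from
  // if.else) becomes a VPBlendRecipe with operands
  // {%a, mask(if.then -> bb), %b, mask(if.else -> bb)}.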
8105 
8106   SmallVector<VPValue *, 2> Operands;
8107   unsigned NumIncoming = Phi->getNumIncomingValues();
8108   for (unsigned In = 0; In < NumIncoming; In++) {
8109     VPValue *EdgeMask =
8110       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8111     assert((EdgeMask || NumIncoming == 1) &&
8112            "Multiple predecessors with one having a full mask");
8113     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
8114     if (EdgeMask)
8115       Operands.push_back(EdgeMask);
8116   }
8117   return new VPBlendRecipe(Phi, Operands);
8118 }
8119 
8120 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
8121                                                    VPlan &Plan) const {
8122 
8123   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8124       [this, CI](ElementCount VF) {
8125         return CM.isScalarWithPredication(CI, VF);
8126       },
8127       Range);
8128 
8129   if (IsPredicated)
8130     return nullptr;
8131 
8132   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8133   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8134              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8135              ID == Intrinsic::pseudoprobe))
8136     return nullptr;
8137 
8138   auto willWiden = [&](ElementCount VF) -> bool {
8139     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8140     // The following case may be scalarized depending on the VF.
8141     // The flag shows whether we use an intrinsic or a usual call for the
8142     // vectorized version of the instruction.
8143     // Is it beneficial to perform an intrinsic call compared to a lib call?
8144     bool NeedToScalarize = false;
8145     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8146     bool UseVectorIntrinsic =
8147         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
8148     return UseVectorIntrinsic || !NeedToScalarize;
8149   };
8150 
8151   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8152     return nullptr;
8153 
8154   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
8155 }
8156 
8157 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8158   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8159          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8160   // Instruction should be widened, unless it is scalar after vectorization,
8161   // scalarization is profitable or it is predicated.
8162   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8163     return CM.isScalarAfterVectorization(I, VF) ||
8164            CM.isProfitableToScalarize(I, VF) ||
8165            CM.isScalarWithPredication(I, VF);
8166   };
8167   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8168                                                              Range);
8169 }
8170 
8171 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
8172   auto IsVectorizableOpcode = [](unsigned Opcode) {
8173     switch (Opcode) {
8174     case Instruction::Add:
8175     case Instruction::And:
8176     case Instruction::AShr:
8177     case Instruction::BitCast:
8178     case Instruction::FAdd:
8179     case Instruction::FCmp:
8180     case Instruction::FDiv:
8181     case Instruction::FMul:
8182     case Instruction::FNeg:
8183     case Instruction::FPExt:
8184     case Instruction::FPToSI:
8185     case Instruction::FPToUI:
8186     case Instruction::FPTrunc:
8187     case Instruction::FRem:
8188     case Instruction::FSub:
8189     case Instruction::ICmp:
8190     case Instruction::IntToPtr:
8191     case Instruction::LShr:
8192     case Instruction::Mul:
8193     case Instruction::Or:
8194     case Instruction::PtrToInt:
8195     case Instruction::SDiv:
8196     case Instruction::Select:
8197     case Instruction::SExt:
8198     case Instruction::Shl:
8199     case Instruction::SIToFP:
8200     case Instruction::SRem:
8201     case Instruction::Sub:
8202     case Instruction::Trunc:
8203     case Instruction::UDiv:
8204     case Instruction::UIToFP:
8205     case Instruction::URem:
8206     case Instruction::Xor:
8207     case Instruction::ZExt:
8208       return true;
8209     }
8210     return false;
8211   };
8212 
8213   if (!IsVectorizableOpcode(I->getOpcode()))
8214     return nullptr;
8215 
8216   // Success: widen this instruction.
8217   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
8218 }
8219 
8220 VPBasicBlock *VPRecipeBuilder::handleReplication(
8221     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8222     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
8223     VPlanPtr &Plan) {
8224   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8225       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8226       Range);
8227 
8228   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8229       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
8230       Range);
8231 
8232   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8233                                        IsUniform, IsPredicated);
8234   setRecipe(I, Recipe);
8235   Plan->addVPValue(I, Recipe);
8236 
8237   // Find if I uses a predicated instruction. If so, it will use its scalar
8238   // value. Avoid hoisting the insert-element which packs the scalar value into
8239   // a vector value, as that happens iff all users use the vector value.
8240   for (auto &Op : I->operands())
8241     if (auto *PredInst = dyn_cast<Instruction>(Op))
8242       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
8243         PredInst2Recipe[PredInst]->setAlsoPack(false);
8244 
8245   // Finalize the recipe for Instr, first if it is not predicated.
8246   if (!IsPredicated) {
8247     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8248     VPBB->appendRecipe(Recipe);
8249     return VPBB;
8250   }
8251   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8252   assert(VPBB->getSuccessors().empty() &&
8253          "VPBB has successors when handling predicated replication.");
8254   // Record predicated instructions for above packing optimizations.
8255   PredInst2Recipe[I] = Recipe;
8256   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8257   VPBlockUtils::insertBlockAfter(Region, VPBB);
8258   auto *RegSucc = new VPBasicBlock();
8259   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8260   return RegSucc;
8261 }
8262 
8263 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8264                                                       VPRecipeBase *PredRecipe,
8265                                                       VPlanPtr &Plan) {
8266   // Instructions marked for predication are replicated and placed under an
8267   // if-then construct to prevent side-effects.
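  // For a predicated store, for instance, this builds a "pred.store" region:
  // pred.store.entry branches on the block mask either to pred.store.if,
  // which holds the replicated store, or directly to pred.store.continue.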
8268 
8269   // Generate recipes to compute the block mask for this region.
8270   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8271 
8272   // Build the triangular if-then region.
8273   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8274   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8275   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8276   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8277   auto *PHIRecipe = Instr->getType()->isVoidTy()
8278                         ? nullptr
8279                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8280   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8281   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8282   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8283 
8284   // Note: first set Entry as region entry and then connect successors starting
8285   // from it in order, to propagate the "parent" of each VPBasicBlock.
8286   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8287   VPBlockUtils::connectBlocks(Pred, Exit);
8288 
8289   return Region;
8290 }
8291 
8292 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8293                                                       VFRange &Range,
8294                                                       VPlanPtr &Plan) {
8295   // First, check for specific widening recipes that deal with calls, memory
8296   // operations, inductions and Phi nodes.
8297   if (auto *CI = dyn_cast<CallInst>(Instr))
8298     return tryToWidenCall(CI, Range, *Plan);
8299 
8300   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8301     return tryToWidenMemory(Instr, Range, Plan);
8302 
8303   VPRecipeBase *Recipe;
8304   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8305     if (Phi->getParent() != OrigLoop->getHeader())
8306       return tryToBlend(Phi, Plan);
8307     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
8308       return Recipe;
8309     return new VPWidenPHIRecipe(Phi);
8310   }
8311 
8312   if (isa<TruncInst>(Instr) &&
8313       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
8314     return Recipe;
8315 
8316   if (!shouldWiden(Instr, Range))
8317     return nullptr;
8318 
8319   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8320     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
8321                                 OrigLoop);
8322 
8323   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8324     bool InvariantCond =
8325         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8326     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
8327                                    InvariantCond);
8328   }
8329 
8330   return tryToWiden(Instr, *Plan);
8331 }
8332 
8333 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8334                                                         ElementCount MaxVF) {
8335   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8336 
8337   // Collect instructions from the original loop that will become trivially dead
8338   // in the vectorized loop. We don't need to vectorize these instructions. For
8339   // example, original induction update instructions can become dead because we
8340   // separately emit induction "steps" when generating code for the new loop.
8341   // Similarly, we create a new latch condition when setting up the structure
8342   // of the new loop, so the old one can become dead.
8343   SmallPtrSet<Instruction *, 4> DeadInstructions;
8344   collectTriviallyDeadInstructions(DeadInstructions);
8345 
8346   // Add assume instructions we need to drop to DeadInstructions, to prevent
8347   // them from being added to the VPlan.
8348   // TODO: We only need to drop assumes in blocks that get flattened. If the
8349   // control flow is preserved, we should keep them.
8350   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8351   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8352 
8353   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8354   // Dead instructions do not need sinking. Remove them from SinkAfter.
8355   for (Instruction *I : DeadInstructions)
8356     SinkAfter.erase(I);
8357 
8358   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8359   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8360     VFRange SubRange = {VF, MaxVFPlusOne};
8361     VPlans.push_back(
8362         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8363     VF = SubRange.End;
8364   }
8365 }
8366 
8367 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8368     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8369     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8370 
8371   // Hold a mapping from predicated instructions to their recipes, in order to
8372   // fix their AlsoPack behavior if a user is determined to replicate and use a
8373   // scalar instead of a vector value.
8374   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
8375 
8376   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8377 
8378   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8379 
8380   // ---------------------------------------------------------------------------
8381   // Pre-construction: record ingredients whose recipes we'll need to further
8382   // process after constructing the initial VPlan.
8383   // ---------------------------------------------------------------------------
8384 
8385   // Mark instructions we'll need to sink later and their targets as
8386   // ingredients whose recipe we'll need to record.
8387   for (auto &Entry : SinkAfter) {
8388     RecipeBuilder.recordRecipeOf(Entry.first);
8389     RecipeBuilder.recordRecipeOf(Entry.second);
8390   }
8391   for (auto &Reduction : CM.getInLoopReductionChains()) {
8392     PHINode *Phi = Reduction.first;
8393     RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
8394     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8395 
8396     RecipeBuilder.recordRecipeOf(Phi);
8397     for (auto &R : ReductionOperations) {
8398       RecipeBuilder.recordRecipeOf(R);
8399       // For min/max reductions, where we have a pair of icmp/select, we also
8400       // need to record the ICmp recipe, so it can be removed later.
8401       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8402         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8403     }
8404   }
8405 
8406   // For each interleave group which is relevant for this (possibly trimmed)
8407   // Range, add it to the set of groups to be later applied to the VPlan and add
8408   // placeholders for its members' Recipes which we'll be replacing with a
8409   // single VPInterleaveRecipe.
8410   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8411     auto applyIG = [IG, this](ElementCount VF) -> bool {
8412       return (VF.isVector() && // Query is illegal for VF == 1
8413               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8414                   LoopVectorizationCostModel::CM_Interleave);
8415     };
8416     if (!getDecisionAndClampRange(applyIG, Range))
8417       continue;
8418     InterleaveGroups.insert(IG);
8419     for (unsigned i = 0; i < IG->getFactor(); i++)
8420       if (Instruction *Member = IG->getMember(i))
8421         RecipeBuilder.recordRecipeOf(Member);
8422   }
8423 
8424   // ---------------------------------------------------------------------------
8425   // Build initial VPlan: Scan the body of the loop in a topological order to
8426   // visit each basic block after having visited its predecessor basic blocks.
8427   // ---------------------------------------------------------------------------
8428 
8429   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8430   auto Plan = std::make_unique<VPlan>();
8431   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8432   Plan->setEntry(VPBB);
8433 
8434   // Scan the body of the loop in a topological order to visit each basic block
8435   // after having visited its predecessor basic blocks.
8436   LoopBlocksDFS DFS(OrigLoop);
8437   DFS.perform(LI);
8438 
8439   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8440     // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients, which fill a new VPBasicBlock.
8442     unsigned VPBBsForBB = 0;
8443     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8444     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8445     VPBB = FirstVPBBForBB;
8446     Builder.setInsertPoint(VPBB);
8447 
8448     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
8450     for (Instruction &I : BB->instructionsWithoutDebug()) {
8451       Instruction *Instr = &I;
8452 
8453       // First filter out irrelevant instructions, to ensure no recipes are
8454       // built for them.
8455       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8456         continue;
8457 
8458       if (auto Recipe =
8459               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8460         for (auto *Def : Recipe->definedValues()) {
8461           auto *UV = Def->getUnderlyingValue();
8462           Plan->addVPValue(UV, Def);
8463         }
8464 
8465         RecipeBuilder.setRecipe(Instr, Recipe);
8466         VPBB->appendRecipe(Recipe);
8467         continue;
8468       }
8469 
      // Otherwise, if all widening options failed, the instruction is to be
8471       // replicated. This may create a successor for VPBB.
8472       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
8473           Instr, Range, VPBB, PredInst2Recipe, Plan);
8474       if (NextVPBB != VPBB) {
8475         VPBB = NextVPBB;
8476         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8477                                     : "");
8478       }
8479     }
8480   }
8481 
8482   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one (VPBB), reflecting original
8484   // basic-blocks with no recipes.
8485   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8486   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8487   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8488   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8489   delete PreEntry;
8490 
8491   // ---------------------------------------------------------------------------
8492   // Transform initial VPlan: Apply previously taken decisions, in order, to
8493   // bring the VPlan to its final state.
8494   // ---------------------------------------------------------------------------
8495 
8496   // Apply Sink-After legal constraints.
8497   for (auto &Entry : SinkAfter) {
8498     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8499     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8500     Sink->moveAfter(Target);
8501   }
8502 
8503   // Interleave memory: for each Interleave Group we marked earlier as relevant
8504   // for this VPlan, replace the Recipes widening its memory instructions with a
8505   // single VPInterleaveRecipe at its insertion point.
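  // (Illustratively, a group of two interleaved loads A[2*i] and A[2*i+1] ends
  // up represented by a single VPInterleaveRecipe defining two values.)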
8506   for (auto IG : InterleaveGroups) {
8507     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8508         RecipeBuilder.getRecipe(IG->getInsertPos()));
8509     SmallVector<VPValue *, 4> StoredValues;
8510     for (unsigned i = 0; i < IG->getFactor(); ++i)
8511       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
8512         StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
8513 
8514     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8515                                         Recipe->getMask());
8516     VPIG->insertBefore(Recipe);
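    // Redirect users of each non-void (i.e. load) member to the corresponding
    // value defined by the new VPInterleaveRecipe.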
8517     unsigned J = 0;
8518     for (unsigned i = 0; i < IG->getFactor(); ++i)
8519       if (Instruction *Member = IG->getMember(i)) {
8520         if (!Member->getType()->isVoidTy()) {
8521           VPValue *OriginalV = Plan->getVPValue(Member);
8522           Plan->removeVPValueFor(Member);
8523           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8524           J++;
8525         }
8526         RecipeBuilder.getRecipe(Member)->eraseFromParent();
8527       }
8528   }
8529 
8530   // Adjust the recipes for any inloop reductions.
8531   if (Range.Start.isVector())
8532     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
8533 
8534   // Finally, if tail is folded by masking, introduce selects between the phi
8535   // and the live-out instruction of each reduction, at the end of the latch.
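  // The select yields the reduction's live-out value for active lanes and the
  // unmodified phi value for lanes masked off by the fold-tail mask.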
8536   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
8537     Builder.setInsertPoint(VPBB);
8538     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
8539     for (auto &Reduction : Legal->getReductionVars()) {
8540       if (CM.isInLoopReduction(Reduction.first))
8541         continue;
8542       VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
8543       VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
8544       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
8545     }
8546   }
8547 
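  // Record the VFs covered by this VPlan and give it a descriptive name, e.g.
  // "Initial VPlan for VF={4,8},UF>=1" (illustrative VFs).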
8548   std::string PlanName;
8549   raw_string_ostream RSO(PlanName);
8550   ElementCount VF = Range.Start;
8551   Plan->addVF(VF);
8552   RSO << "Initial VPlan for VF={" << VF;
8553   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
8554     Plan->addVF(VF);
8555     RSO << "," << VF;
8556   }
8557   RSO << "},UF>=1";
8558   RSO.flush();
8559   Plan->setName(PlanName);
8560 
8561   return Plan;
8562 }
8563 
8564 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
8566   // transformations before even evaluating whether vectorization is profitable.
8567   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8568   // the vectorization pipeline.
8569   assert(!OrigLoop->isInnermost());
8570   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8571 
8572   // Create new empty VPlan
8573   auto Plan = std::make_unique<VPlan>();
8574 
8575   // Build hierarchical CFG
8576   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8577   HCFGBuilder.buildHierarchicalCFG();
8578 
8579   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8580        VF *= 2)
8581     Plan->addVF(VF);
8582 
8583   if (EnableVPlanPredication) {
8584     VPlanPredicator VPP(*Plan);
8585     VPP.predicate();
8586 
8587     // Avoid running transformation to recipes until masked code generation in
8588     // VPlan-native path is in place.
8589     return Plan;
8590   }
8591 
8592   SmallPtrSet<Instruction *, 1> DeadInstructions;
8593   VPlanTransforms::VPInstructionsToVPRecipes(
8594       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
8595   return Plan;
8596 }
8597 
8598 // Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi needs to be converted to
8600 // reductions, with one operand being vector and the other being the scalar
8601 // reduction chain.
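// For example (illustrative), for "s += a[i]" the chain is the phi followed by
// the add; the widened add is replaced by a VPReductionRecipe whose ChainOp is
// the scalar value flowing along the chain and whose VecOp is the widened load
// of a[i].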
8602 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8603     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8604   for (auto &Reduction : CM.getInLoopReductionChains()) {
8605     PHINode *Phi = Reduction.first;
8606     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8607     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8608 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
8611     // which of the two operands will remain scalar and which will be reduced.
8612     // For minmax the chain will be the select instructions.
8613     Instruction *Chain = Phi;
8614     for (Instruction *R : ReductionOperations) {
8615       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8616       RecurKind Kind = RdxDesc.getRecurrenceKind();
8617 
8618       VPValue *ChainOp = Plan->getVPValue(Chain);
8619       unsigned FirstOpId;
8620       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8621         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8622                "Expected to replace a VPWidenSelectSC");
8623         FirstOpId = 1;
8624       } else {
8625         assert(isa<VPWidenRecipe>(WidenRecipe) &&
8626                "Expected to replace a VPWidenSC");
8627         FirstOpId = 0;
8628       }
8629       unsigned VecOpId =
8630           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
8631       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
8632 
8633       auto *CondOp = CM.foldTailByMasking()
8634                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
8635                          : nullptr;
8636       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
8637           &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
8638       WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe);
8639       Plan->removeVPValueFor(R);
8640       Plan->addVPValue(R, RedRecipe);
8641       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
8643       WidenRecipe->eraseFromParent();
8644 
8645       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8646         VPRecipeBase *CompareRecipe =
8647             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
8648         assert(isa<VPWidenRecipe>(CompareRecipe) &&
8649                "Expected to replace a VPWidenSC");
8650         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
8651                "Expected no remaining users");
8652         CompareRecipe->eraseFromParent();
8653       }
8654       Chain = R;
8655     }
8656   }
8657 }
8658 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
8663 
8664 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
8665     Value *V, const VPIteration &Instance) {
8666   return ILV.getOrCreateScalarValue(V, Instance);
8667 }
8668 
8669 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
8670                                VPSlotTracker &SlotTracker) const {
8671   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
8672   IG->getInsertPos()->printAsOperand(O, false);
8673   O << ", ";
8674   getAddr()->printAsOperand(O, SlotTracker);
8675   VPValue *Mask = getMask();
8676   if (Mask) {
8677     O << ", ";
8678     Mask->printAsOperand(O, SlotTracker);
8679   }
8680   for (unsigned i = 0; i < IG->getFactor(); ++i)
8681     if (Instruction *I = IG->getMember(i))
8682       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
8683 }
8684 
8685 void VPWidenCallRecipe::execute(VPTransformState &State) {
8686   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
8687                                   *this, State);
8688 }
8689 
8690 void VPWidenSelectRecipe::execute(VPTransformState &State) {
8691   State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
8692                                     this, *this, InvariantCond, State);
8693 }
8694 
8695 void VPWidenRecipe::execute(VPTransformState &State) {
8696   State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
8697 }
8698 
8699 void VPWidenGEPRecipe::execute(VPTransformState &State) {
8700   State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
8701                       *this, State.UF, State.VF, IsPtrLoopInvariant,
8702                       IsIndexLoopInvariant, State);
8703 }
8704 
8705 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
8706   assert(!State.Instance && "Int or FP induction being replicated.");
8707   State.ILV->widenIntOrFpInduction(IV, Trunc);
8708 }
8709 
8710 void VPWidenPHIRecipe::execute(VPTransformState &State) {
8711   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
8712 }
8713 
8714 void VPBlendRecipe::execute(VPTransformState &State) {
8715   State.ILV->setDebugLocFromInst(State.Builder, Phi);
8716   // We know that all PHIs in non-header blocks are converted into
8717   // selects, so we don't have to worry about the insertion order and we
8718   // can just use the builder.
8719   // At this point we generate the predication tree. There may be
8720   // duplications since this is a simple recursive scan, but future
8721   // optimizations will clean it up.
8722 
8723   unsigned NumIncoming = getNumIncomingValues();
8724 
8725   // Generate a sequence of selects of the form:
8726   // SELECT(Mask3, In3,
8727   //        SELECT(Mask2, In2,
8728   //               SELECT(Mask1, In1,
8729   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are essentially undef, are taken from In0.
8732   InnerLoopVectorizer::VectorParts Entry(State.UF);
8733   for (unsigned In = 0; In < NumIncoming; ++In) {
8734     for (unsigned Part = 0; Part < State.UF; ++Part) {
8735       // We might have single edge PHIs (blocks) - use an identity
8736       // 'select' for the first PHI operand.
8737       Value *In0 = State.get(getIncomingValue(In), Part);
8738       if (In == 0)
8739         Entry[Part] = In0; // Initialize with the first incoming value.
8740       else {
8741         // Select between the current value and the previous incoming edge
8742         // based on the incoming mask.
8743         Value *Cond = State.get(getMask(In), Part);
8744         Entry[Part] =
8745             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8746       }
8747     }
8748   }
8749   for (unsigned Part = 0; Part < State.UF; ++Part)
8750     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
8751 }
8752 
8753 void VPInterleaveRecipe::execute(VPTransformState &State) {
8754   assert(!State.Instance && "Interleave group being replicated.");
8755   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
8756                                       getStoredValues(), getMask());
8757 }
8758 
8759 void VPReductionRecipe::execute(VPTransformState &State) {
8760   assert(!State.Instance && "Reduction being replicated.");
8761   for (unsigned Part = 0; Part < State.UF; ++Part) {
8762     RecurKind Kind = RdxDesc->getRecurrenceKind();
8763     Value *NewVecOp = State.get(getVecOp(), Part);
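    // If the reduction is predicated, select the recurrence identity (e.g. 0
    // for an add reduction) for masked-off lanes so they do not affect the
    // reduced value.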
8764     if (VPValue *Cond = getCondOp()) {
8765       Value *NewCond = State.get(Cond, Part);
8766       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
8767       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
8768           Kind, VecTy->getElementType());
8769       Constant *IdenVec =
8770           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
8771       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
8772       NewVecOp = Select;
8773     }
8774     Value *NewRed =
8775         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
8776     Value *PrevInChain = State.get(getChainOp(), Part);
8777     Value *NextInChain;
8778     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8779       NextInChain =
8780           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
8781                          NewRed, PrevInChain);
8782     } else {
8783       NextInChain = State.Builder.CreateBinOp(
8784           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
8785           PrevInChain);
8786     }
8787     State.set(this, getUnderlyingInstr(), NextInChain, Part);
8788   }
8789 }
8790 
8791 void VPReplicateRecipe::execute(VPTransformState &State) {
8792   if (State.Instance) { // Generate a single instance.
8793     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
8794     State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
8795                                     *State.Instance, IsPredicated, State);
8796     // Insert scalar instance packing it into a vector.
8797     if (AlsoPack && State.VF.isVector()) {
8798       // If we're constructing lane 0, initialize to start from undef.
8799       if (State.Instance->Lane == 0) {
8800         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8801         Value *Undef = UndefValue::get(
8802             VectorType::get(getUnderlyingValue()->getType(), State.VF));
8803         State.ValueMap.setVectorValue(getUnderlyingInstr(),
8804                                       State.Instance->Part, Undef);
8805       }
8806       State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
8807                                            *State.Instance);
8808     }
8809     return;
8810   }
8811 
8812   // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
8815   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8816   assert((!State.VF.isScalable() || IsUniform) &&
8817          "Can't scalarize a scalable vector");
8818   for (unsigned Part = 0; Part < State.UF; ++Part)
8819     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8820       State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
8821                                       IsPredicated, State);
8822 }
8823 
8824 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8825   assert(State.Instance && "Branch on Mask works only on single instance.");
8826 
8827   unsigned Part = State.Instance->Part;
8828   unsigned Lane = State.Instance->Lane;
8829 
8830   Value *ConditionBit = nullptr;
8831   VPValue *BlockInMask = getMask();
8832   if (BlockInMask) {
8833     ConditionBit = State.get(BlockInMask, Part);
8834     if (ConditionBit->getType()->isVectorTy())
8835       ConditionBit = State.Builder.CreateExtractElement(
8836           ConditionBit, State.Builder.getInt32(Lane));
8837   } else // Block in mask is all-one.
8838     ConditionBit = State.Builder.getTrue();
8839 
8840   // Replace the temporary unreachable terminator with a new conditional branch,
8841   // whose two destinations will be set later when they are created.
8842   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8843   assert(isa<UnreachableInst>(CurrentTerminator) &&
8844          "Expected to replace unreachable terminator with conditional branch.");
8845   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8846   CondBr->setSuccessor(0, nullptr);
8847   ReplaceInstWithInst(CurrentTerminator, CondBr);
8848 }
8849 
8850 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8851   assert(State.Instance && "Predicated instruction PHI works per instance.");
8852   Instruction *ScalarPredInst =
8853       cast<Instruction>(State.get(getOperand(0), *State.Instance));
8854   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8855   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8856   assert(PredicatingBB && "Predicated block has no single predecessor.");
8857 
8858   // By current pack/unpack logic we need to generate only a single phi node: if
8859   // a vector value for the predicated instruction exists at this point it means
8860   // the instruction has vector users only, and a phi for the vector value is
8861   // needed. In this case the recipe of the predicated instruction is marked to
8862   // also do that packing, thereby "hoisting" the insert-element sequence.
8863   // Otherwise, a phi node for the scalar value is needed.
8864   unsigned Part = State.Instance->Part;
8865   Instruction *PredInst =
8866       cast<Instruction>(getOperand(0)->getUnderlyingValue());
8867   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8868     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8869     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8870     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8871     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8872     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8873     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8874   } else {
8875     Type *PredInstType = PredInst->getType();
8876     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8877     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8878     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8879     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8880   }
8881 }
8882 
8883 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8884   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
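  // Only loads define a result VPValue; for stores a null definition is passed
  // below.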
8885   State.ILV->vectorizeMemoryInstruction(&Ingredient, State,
8886                                         StoredValue ? nullptr : getVPValue(),
8887                                         getAddr(), StoredValue, getMask());
8888 }
8889 
8890 // Determine how to lower the scalar epilogue, which depends on 1) optimising
8891 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8892 // predication, and 4) a TTI hook that analyses whether the loop is suitable
8893 // for predication.
8894 static ScalarEpilogueLowering getScalarEpilogueLowering(
8895     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8896     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8897     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8898     LoopVectorizationLegality &LVL) {
8899   // 1) OptSize takes precedence over all other options, i.e. if this is set,
8900   // don't look at hints or options, and don't request a scalar epilogue.
8901   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8902   // LoopAccessInfo (due to code dependency and not being able to reliably get
8903   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8904   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
8905   // versioning when the vectorization is forced, unlike hasOptSize. So revert
8906   // back to the old way and vectorize with versioning when forced. See D81345.)
8907   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8908                                                       PGSOQueryType::IRPass) &&
8909                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8910     return CM_ScalarEpilogueNotAllowedOptSize;
8911 
8912   // 2) If set, obey the directives
8913   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
8914     switch (PreferPredicateOverEpilogue) {
8915     case PreferPredicateTy::ScalarEpilogue:
8916       return CM_ScalarEpilogueAllowed;
8917     case PreferPredicateTy::PredicateElseScalarEpilogue:
8918       return CM_ScalarEpilogueNotNeededUsePredicate;
8919     case PreferPredicateTy::PredicateOrDontVectorize:
8920       return CM_ScalarEpilogueNotAllowedUsePredicate;
    }
8922   }
8923 
8924   // 3) If set, obey the hints
8925   switch (Hints.getPredicate()) {
8926   case LoopVectorizeHints::FK_Enabled:
8927     return CM_ScalarEpilogueNotNeededUsePredicate;
8928   case LoopVectorizeHints::FK_Disabled:
8929     return CM_ScalarEpilogueAllowed;
  }
8931 
8932   // 4) if the TTI hook indicates this is profitable, request predication.
8933   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8934                                        LVL.getLAI()))
8935     return CM_ScalarEpilogueNotNeededUsePredicate;
8936 
8937   return CM_ScalarEpilogueAllowed;
8938 }
8939 
8940 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
8941                            unsigned Part) {
8942   set(Def, V, Part);
8943   ILV->setVectorValue(IRDef, Part, V);
8944 }
8945 
8946 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
8948 // VPlan-to-VPlan transformations from the very beginning without modifying the
8949 // input LLVM IR.
8950 static bool processLoopInVPlanNativePath(
8951     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8952     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8953     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8954     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8955     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8956 
8957   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
8958     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8959     return false;
8960   }
8961   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8962   Function *F = L->getHeader()->getParent();
8963   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8964 
8965   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8966       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8967 
8968   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8969                                 &Hints, IAI);
8970   // Use the planner for outer loop vectorization.
8971   // TODO: CM is not used at this point inside the planner. Turn CM into an
8972   // optional argument if we don't need it in the future.
8973   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8974 
8975   // Get user vectorization factor.
8976   ElementCount UserVF = Hints.getWidth();
8977 
8978   // Plan how to best vectorize, return the best VF and its cost.
8979   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
8980 
8981   // If we are stress testing VPlan builds, do not attempt to generate vector
8982   // code. Masked vector code generation support will follow soon.
8983   // Also, do not attempt to vectorize if no vector code will be produced.
8984   if (VPlanBuildStressTest || EnableVPlanPredication ||
8985       VectorizationFactor::Disabled() == VF)
8986     return false;
8987 
8988   LVP.setBestPlan(VF.Width, 1);
8989 
8990   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
8991                          &CM, BFI, PSI);
8992   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8993                     << L->getHeader()->getParent()->getName() << "\"\n");
8994   LVP.executePlan(LB, DT);
8995 
8996   // Mark the loop as already vectorized to avoid vectorizing again.
8997   Hints.setAlreadyVectorized();
8998 
8999   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9000   return true;
9001 }
9002 
9003 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9004     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9005                                !EnableLoopInterleaving),
9006       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9007                               !EnableLoopVectorization) {}
9008 
9009 bool LoopVectorizePass::processLoop(Loop *L) {
9010   assert((EnableVPlanNativePath || L->isInnermost()) &&
9011          "VPlan-native path is not enabled. Only process inner loops.");
9012 
9013 #ifndef NDEBUG
9014   const std::string DebugLocStr = getDebugLocString(L);
9015 #endif /* NDEBUG */
9016 
9017   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
9018                     << L->getHeader()->getParent()->getName() << "\" from "
9019                     << DebugLocStr << "\n");
9020 
9021   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
9022 
9023   LLVM_DEBUG(
9024       dbgs() << "LV: Loop hints:"
9025              << " force="
9026              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9027                      ? "disabled"
9028                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9029                             ? "enabled"
9030                             : "?"))
9031              << " width=" << Hints.getWidth()
9032              << " unroll=" << Hints.getInterleave() << "\n");
9033 
9034   // Function containing loop
9035   Function *F = L->getHeader()->getParent();
9036 
9037   // Looking at the diagnostic output is the only way to determine if a loop
9038   // was vectorized (other than looking at the IR or machine code), so it
9039   // is important to generate an optimization remark for each loop. Most of
9040   // these messages are generated as OptimizationRemarkAnalysis. Remarks
9041   // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose, reporting vectorized loops and unvectorized loops that may
9043   // benefit from vectorization, respectively.
9044 
9045   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9046     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9047     return false;
9048   }
9049 
9050   PredicatedScalarEvolution PSE(*SE, *L);
9051 
9052   // Check if it is legal to vectorize the loop.
9053   LoopVectorizationRequirements Requirements(*ORE);
9054   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
9055                                 &Requirements, &Hints, DB, AC, BFI, PSI);
9056   if (!LVL.canVectorize(EnableVPlanNativePath)) {
9057     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9058     Hints.emitRemarkWithHints();
9059     return false;
9060   }
9061 
9062   // Check the function attributes and profiles to find out if this function
9063   // should be optimized for size.
9064   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9065       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
9066 
9067   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9068   // here. They may require CFG and instruction level transformations before
9069   // even evaluating whether vectorization is profitable. Since we cannot modify
9070   // the incoming IR, we need to build VPlan upfront in the vectorization
9071   // pipeline.
9072   if (!L->isInnermost())
9073     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9074                                         ORE, BFI, PSI, Hints);
9075 
9076   assert(L->isInnermost() && "Inner loop expected.");
9077 
9078   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9079   // count by optimizing for size, to minimize overheads.
9080   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9081   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9082     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9083                       << "This loop is worth vectorizing only if no scalar "
9084                       << "iteration overheads are incurred.");
9085     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9086       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9087     else {
9088       LLVM_DEBUG(dbgs() << "\n");
9089       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9090     }
9091   }
9092 
9093   // Check the function attributes to see if implicit floats are allowed.
9094   // FIXME: This check doesn't seem possibly correct -- what if the loop is
9095   // an integer loop and the vector instructions selected are purely integer
9096   // vector instructions?
9097   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9098     reportVectorizationFailure(
9099         "Can't vectorize when the NoImplicitFloat attribute is used",
9100         "loop not vectorized due to NoImplicitFloat attribute",
9101         "NoImplicitFloat", ORE, L);
9102     Hints.emitRemarkWithHints();
9103     return false;
9104   }
9105 
9106   // Check if the target supports potentially unsafe FP vectorization.
9107   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9108   // for the target we're vectorizing for, to make sure none of the
9109   // additional fp-math flags can help.
9110   if (Hints.isPotentiallyUnsafe() &&
9111       TTI->isFPVectorizationPotentiallyUnsafe()) {
9112     reportVectorizationFailure(
9113         "Potentially unsafe FP op prevents vectorization",
9114         "loop not vectorized due to unsafe FP support.",
9115         "UnsafeFP", ORE, L);
9116     Hints.emitRemarkWithHints();
9117     return false;
9118   }
9119 
9120   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9121   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9122 
9123   // If an override option has been passed in for interleaved accesses, use it.
9124   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9125     UseInterleaved = EnableInterleavedMemAccesses;
9126 
9127   // Analyze interleaved memory accesses.
9128   if (UseInterleaved) {
9129     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9130   }
9131 
9132   // Use the cost model.
9133   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9134                                 F, &Hints, IAI);
9135   CM.collectValuesToIgnore();
9136 
9137   // Use the planner for vectorization.
9138   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
9139 
9140   // Get user vectorization factor and interleave count.
9141   ElementCount UserVF = Hints.getWidth();
9142   unsigned UserIC = Hints.getInterleave();
9143 
9144   // Plan how to best vectorize, return the best VF and its cost.
9145   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9146 
9147   VectorizationFactor VF = VectorizationFactor::Disabled();
9148   unsigned IC = 1;
9149 
9150   if (MaybeVF) {
9151     VF = *MaybeVF;
9152     // Select the interleave count.
9153     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9154   }
9155 
9156   // Identify the diagnostic messages that should be produced.
9157   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9158   bool VectorizeLoop = true, InterleaveLoop = true;
9159   if (Requirements.doesNotMeet(F, L, Hints)) {
9160     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
9161                          "requirements.\n");
9162     Hints.emitRemarkWithHints();
9163     return false;
9164   }
9165 
9166   if (VF.Width.isScalar()) {
9167     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9168     VecDiagMsg = std::make_pair(
9169         "VectorizationNotBeneficial",
9170         "the cost-model indicates that vectorization is not beneficial");
9171     VectorizeLoop = false;
9172   }
9173 
9174   if (!MaybeVF && UserIC > 1) {
9175     // Tell the user interleaving was avoided up-front, despite being explicitly
9176     // requested.
9177     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9178                          "interleaving should be avoided up front\n");
9179     IntDiagMsg = std::make_pair(
9180         "InterleavingAvoided",
9181         "Ignoring UserIC, because interleaving was avoided up front");
9182     InterleaveLoop = false;
9183   } else if (IC == 1 && UserIC <= 1) {
9184     // Tell the user interleaving is not beneficial.
9185     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9186     IntDiagMsg = std::make_pair(
9187         "InterleavingNotBeneficial",
9188         "the cost-model indicates that interleaving is not beneficial");
9189     InterleaveLoop = false;
9190     if (UserIC == 1) {
9191       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9192       IntDiagMsg.second +=
9193           " and is explicitly disabled or interleave count is set to 1";
9194     }
9195   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial but is explicitly disabled.
9197     LLVM_DEBUG(
9198         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9199     IntDiagMsg = std::make_pair(
9200         "InterleavingBeneficialButDisabled",
9201         "the cost-model indicates that interleaving is beneficial "
9202         "but is explicitly disabled or interleave count is set to 1");
9203     InterleaveLoop = false;
9204   }
9205 
9206   // Override IC if user provided an interleave count.
9207   IC = UserIC > 0 ? UserIC : IC;
9208 
9209   // Emit diagnostic messages, if any.
9210   const char *VAPassName = Hints.vectorizeAnalysisPassName();
9211   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
9213     ORE->emit([&]() {
9214       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9215                                       L->getStartLoc(), L->getHeader())
9216              << VecDiagMsg.second;
9217     });
9218     ORE->emit([&]() {
9219       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9220                                       L->getStartLoc(), L->getHeader())
9221              << IntDiagMsg.second;
9222     });
9223     return false;
9224   } else if (!VectorizeLoop && InterleaveLoop) {
9225     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9226     ORE->emit([&]() {
9227       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9228                                         L->getStartLoc(), L->getHeader())
9229              << VecDiagMsg.second;
9230     });
9231   } else if (VectorizeLoop && !InterleaveLoop) {
9232     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9233                       << ") in " << DebugLocStr << '\n');
9234     ORE->emit([&]() {
9235       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9236                                         L->getStartLoc(), L->getHeader())
9237              << IntDiagMsg.second;
9238     });
9239   } else if (VectorizeLoop && InterleaveLoop) {
9240     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9241                       << ") in " << DebugLocStr << '\n');
9242     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9243   }
9244 
9245   LVP.setBestPlan(VF.Width, IC);
9246 
9247   using namespace ore;
9248   bool DisableRuntimeUnroll = false;
9249   MDNode *OrigLoopID = L->getLoopID();
9250 
9251   if (!VectorizeLoop) {
9252     assert(IC > 1 && "interleave count should not be 1 or 0");
9253     // If we decided that it is not legal to vectorize the loop, then
9254     // interleave it.
9255     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
9256                                BFI, PSI);
9257     LVP.executePlan(Unroller, DT);
9258 
9259     ORE->emit([&]() {
9260       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9261                                 L->getHeader())
9262              << "interleaved loop (interleaved count: "
9263              << NV("InterleaveCount", IC) << ")";
9264     });
9265   } else {
9266     // If we decided that it is *legal* to vectorize the loop, then do it.
9267 
9268     // Consider vectorizing the epilogue too if it's profitable.
9269     VectorizationFactor EpilogueVF =
9270       CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
9271     if (EpilogueVF.Width.isVector()) {
9272 
9273       // The first pass vectorizes the main loop and creates a scalar epilogue
9274       // to be vectorized by executing the plan (potentially with a different
9275       // factor) again shortly afterwards.
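      // EPI carries both factors: the main loop runs with (VF.Width, IC) and
      // the epilogue loop with (EpilogueVF.Width, UF = 1).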
9276       EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
9277                                         EpilogueVF.Width.getKnownMinValue(), 1);
9278       EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI,
9279                                          &LVL, &CM, BFI, PSI);
9280 
9281       LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
9282       LVP.executePlan(MainILV, DT);
9283       ++LoopsVectorized;
9284 
9285       simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9286       formLCSSARecursively(*L, *DT, LI, SE);
9287 
9288       // Second pass vectorizes the epilogue and adjusts the control flow
9289       // edges from the first pass.
9290       LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
9291       EPI.MainLoopVF = EPI.EpilogueVF;
9292       EPI.MainLoopUF = EPI.EpilogueUF;
9293       EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
9294                                                ORE, EPI, &LVL, &CM, BFI, PSI);
9295       LVP.executePlan(EpilogILV, DT);
9296       ++LoopsEpilogueVectorized;
9297 
9298       if (!MainILV.areSafetyChecksAdded())
9299         DisableRuntimeUnroll = true;
9300     } else {
9301       InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
9302                              &LVL, &CM, BFI, PSI);
9303       LVP.executePlan(LB, DT);
9304       ++LoopsVectorized;
9305 
      // Add metadata to disable runtime unrolling of the scalar loop when
      // there are no runtime checks for strides and memory. A scalar loop that
      // is rarely used is not worth unrolling.
9309       if (!LB.areSafetyChecksAdded())
9310         DisableRuntimeUnroll = true;
9311     }
9312 
9313     // Report the vectorization decision.
9314     ORE->emit([&]() {
9315       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
9316                                 L->getHeader())
9317              << "vectorized loop (vectorization width: "
9318              << NV("VectorizationFactor", VF.Width)
9319              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
9320     });
9321   }
9322 
9323   Optional<MDNode *> RemainderLoopID =
9324       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
9325                                       LLVMLoopVectorizeFollowupEpilogue});
9326   if (RemainderLoopID.hasValue()) {
9327     L->setLoopID(RemainderLoopID.getValue());
9328   } else {
9329     if (DisableRuntimeUnroll)
9330       AddRuntimeUnrollDisableMetaData(L);
9331 
9332     // Mark the loop as already vectorized to avoid vectorizing again.
9333     Hints.setAlreadyVectorized();
9334   }
9335 
9336   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9337   return true;
9338 }
9339 
9340 LoopVectorizeResult LoopVectorizePass::runImpl(
9341     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
9342     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
9343     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
9344     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
9345     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
9346   SE = &SE_;
9347   LI = &LI_;
9348   TTI = &TTI_;
9349   DT = &DT_;
9350   BFI = &BFI_;
9351   TLI = TLI_;
9352   AA = &AA_;
9353   AC = &AC_;
9354   GetLAA = &GetLAA_;
9355   DB = &DB_;
9356   ORE = &ORE_;
9357   PSI = PSI_;
9358 
9359   // Don't attempt if
9360   // 1. the target claims to have no vector registers, and
9361   // 2. interleaving won't help ILP.
9362   //
9363   // The second condition is necessary because, even if the target has no
9364   // vector registers, loop vectorization may still enable scalar
9365   // interleaving.
9366   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
9367       TTI->getMaxInterleaveFactor(1) < 2)
9368     return LoopVectorizeResult(false, false);
9369 
9370   bool Changed = false, CFGChanged = false;
9371 
9372   // The vectorizer requires loops to be in simplified form.
9373   // Since simplification may add new inner loops, it has to run before the
9374   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
9376   // vectorized.
9377   for (auto &L : *LI)
9378     Changed |= CFGChanged |=
9379         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9380 
9381   // Build up a worklist of inner-loops to vectorize. This is necessary as
9382   // the act of vectorizing or partially unrolling a loop creates new loops
9383   // and can invalidate iterators across the loops.
9384   SmallVector<Loop *, 8> Worklist;
9385 
9386   for (Loop *L : *LI)
9387     collectSupportedLoops(*L, LI, ORE, Worklist);
9388 
9389   LoopsAnalyzed += Worklist.size();
9390 
9391   // Now walk the identified inner loops.
9392   while (!Worklist.empty()) {
9393     Loop *L = Worklist.pop_back_val();
9394 
9395     // For the inner loops we actually process, form LCSSA to simplify the
9396     // transform.
9397     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
9398 
9399     Changed |= CFGChanged |= processLoop(L);
9400   }
9401 
9402   // Process each loop nest in the function.
9403   return LoopVectorizeResult(Changed, CFGChanged);
9404 }
9405 
9406 PreservedAnalyses LoopVectorizePass::run(Function &F,
9407                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
9451 }
9452