1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
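//
// As an illustrative sketch (the vector width VF of 4 below is an assumed
// example, not a statement about any particular target), a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten so that each 'wide' iteration processes VF
// elements at once, with the leftover iterations handled by a scalar epilogue:
//
//   for (i = 0; i + 4 <= n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3];  // one SIMD add per wide iteration
//   for (; i < n; ++i)                    // scalar epilogue
//     a[i] = b[i] + c[i];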
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC - two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 #ifndef NDEBUG
161 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
162 #endif
163 
164 /// @{
165 /// Metadata attribute names
166 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
167 const char LLVMLoopVectorizeFollowupVectorized[] =
168     "llvm.loop.vectorize.followup_vectorized";
169 const char LLVMLoopVectorizeFollowupEpilogue[] =
170     "llvm.loop.vectorize.followup_epilogue";
171 /// @}
172 
173 STATISTIC(LoopsVectorized, "Number of loops vectorized");
174 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
175 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
176 
177 static cl::opt<bool> EnableEpilogueVectorization(
178     "enable-epilogue-vectorization", cl::init(false), cl::Hidden,
179     cl::desc("Enable vectorization of epilogue loops."));
180 
181 static cl::opt<unsigned> EpilogueVectorizationForceVF(
182     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
183     cl::desc("When epilogue vectorization is enabled, and a value greater than "
184              "1 is specified, forces the given VF for all applicable epilogue "
185              "loops."));
186 
187 static cl::opt<unsigned> EpilogueVectorizationMinVF(
188     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189     cl::desc("Only loops with vectorization factor equal to or larger than "
190              "the specified value are considered for epilogue vectorization."));
191 
192 /// Loops with a known constant trip count below this number are vectorized
193 /// only if no scalar iteration overheads are incurred.
194 static cl::opt<unsigned> TinyTripCountVectorThreshold(
195     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
196     cl::desc("Loops with a constant trip count that is smaller than this "
197              "value are vectorized only if no scalar iteration overheads "
198              "are incurred."));
199 
200 // Option prefer-predicate-over-epilogue indicates that an epilogue is
201 // undesired and that predication is preferred: the vectorizer will try to
202 // fold the tail loop (epilogue) into the vector body and predicate the
203 // instructions accordingly. If tail-folding fails, the fallback strategy is
204 // determined by one of the following values:
205 namespace PreferPredicateTy {
206   enum Option {
207     ScalarEpilogue = 0,
208     PredicateElseScalarEpilogue,
209     PredicateOrDontVectorize
210   };
211 } // namespace PreferPredicateTy
212 
213 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
214     "prefer-predicate-over-epilogue",
215     cl::init(PreferPredicateTy::ScalarEpilogue),
216     cl::Hidden,
217     cl::desc("Tail-folding and predication preferences over creating a scalar "
218              "epilogue loop."),
219     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
220                           "scalar-epilogue",
221                           "Don't tail-predicate loops, create scalar epilogue"),
222                clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
223                           "predicate-else-scalar-epilogue",
224                           "Prefer tail-folding, create scalar epilogue if tail "
225                           "folding fails."),
226                clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
227                           "predicate-dont-vectorize",
228                           "Prefer tail-folding, don't attempt vectorization if "
229                           "tail-folding fails.")));
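
// A sketch of what tail folding means in practice (illustrative pseudo-code
// only; the VF of 4 below is an assumed example width): instead of running
// the last (n % 4) iterations in a scalar remainder loop, the vector body is
// executed under a per-lane mask so the tail is folded into it:
//
//   for (int i = 0; i < n; i += 4) {
//     mask = {i + 0 < n, i + 1 < n, i + 2 < n, i + 3 < n};
//     vb = masked_load(&b[i], mask);
//     vc = masked_load(&c[i], mask);
//     masked_store(&a[i], vb + vc, mask);
//   }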
230 
231 static cl::opt<bool> MaximizeBandwidth(
232     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
233     cl::desc("Maximize bandwidth when selecting vectorization factor which "
234              "will be determined by the smallest type in loop."));
235 
236 static cl::opt<bool> EnableInterleavedMemAccesses(
237     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
238     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
239 
240 /// An interleave-group may need masking if it resides in a block that needs
241 /// predication, or in order to mask away gaps.
242 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
243     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
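
// For reference, an interleaved access group arises from code like the
// following (an illustrative example, not taken from this file): the two
// loads form a group with interleave factor 2 that can be vectorized as one
// wide load plus shuffles. If the group has gaps (e.g. only AB[i].x is read)
// or sits in a block that requires predication, the wide load must be masked
// so that skipped or inactive lanes are never accessed.
//
//   struct Pair { int x, y; };
//   int sum = 0;
//   for (int i = 0; i < n; ++i)
//     sum += AB[i].x + AB[i].y;  // two loads, interleaved with stride 2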
245 
246 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
247     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
248     cl::desc("We don't interleave loops with an estimated constant trip count "
249              "below this number"));
250 
251 static cl::opt<unsigned> ForceTargetNumScalarRegs(
252     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
253     cl::desc("A flag that overrides the target's number of scalar registers."));
254 
255 static cl::opt<unsigned> ForceTargetNumVectorRegs(
256     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
257     cl::desc("A flag that overrides the target's number of vector registers."));
258 
259 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
260     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
261     cl::desc("A flag that overrides the target's max interleave factor for "
262              "scalar loops."));
263 
264 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
265     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
266     cl::desc("A flag that overrides the target's max interleave factor for "
267              "vectorized loops."));
268 
269 static cl::opt<unsigned> ForceTargetInstructionCost(
270     "force-target-instruction-cost", cl::init(0), cl::Hidden,
271     cl::desc("A flag that overrides the target's expected cost for "
272              "an instruction to a single constant value. Mostly "
273              "useful for getting consistent testing."));
274 
275 static cl::opt<unsigned> SmallLoopCost(
276     "small-loop-cost", cl::init(20), cl::Hidden,
277     cl::desc(
278         "The cost of a loop that is considered 'small' by the interleaver."));
279 
280 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
281     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
282     cl::desc("Enable the use of the block frequency analysis to access PGO "
283              "heuristics minimizing code growth in cold regions and being more "
284              "aggressive in hot regions."));
285 
286 // Runtime interleave loops for load/store throughput.
287 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
288     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
289     cl::desc(
290         "Enable runtime interleaving until load/store ports are saturated"));
291 
292 /// Interleave small loops with scalar reductions.
293 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
294     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
295     cl::desc("Enable interleaving for loops with small iteration counts that "
296              "contain scalar reductions to expose ILP."));
297 
298 /// The number of stores in a loop that are allowed to need predication.
299 static cl::opt<unsigned> NumberOfStoresToPredicate(
300     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
301     cl::desc("Max number of stores to be predicated behind an if."));
302 
303 static cl::opt<bool> EnableIndVarRegisterHeur(
304     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
305     cl::desc("Count the induction variable only once when interleaving"));
306 
307 static cl::opt<bool> EnableCondStoresVectorization(
308     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
309     cl::desc("Enable if predication of stores during vectorization."));
310 
311 static cl::opt<unsigned> MaxNestedScalarReductionIC(
312     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
313     cl::desc("The maximum interleave count to use when interleaving a scalar "
314              "reduction in a nested loop."));
315 
316 static cl::opt<bool>
317     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
318                            cl::Hidden,
319                            cl::desc("Prefer in-loop vector reductions, "
320                                     "overriding the target's preference."));
321 
322 static cl::opt<bool> PreferPredicatedReductionSelect(
323     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
324     cl::desc(
325         "Prefer predicating a reduction operation over an after loop select."));
326 
327 cl::opt<bool> EnableVPlanNativePath(
328     "enable-vplan-native-path", cl::init(false), cl::Hidden,
329     cl::desc("Enable VPlan-native vectorization path with "
330              "support for outer loop vectorization."));
331 
332 // FIXME: Remove this switch once we have divergence analysis. Currently we
333 // assume divergent non-backedge branches when this switch is true.
334 cl::opt<bool> EnableVPlanPredication(
335     "enable-vplan-predication", cl::init(false), cl::Hidden,
336     cl::desc("Enable VPlan-native vectorization path predicator with "
337              "support for outer loop vectorization."));
338 
339 // This flag enables the stress testing of the VPlan H-CFG construction in the
340 // VPlan-native vectorization path. It must be used in conjunction with
341 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
342 // verification of the H-CFGs built.
343 static cl::opt<bool> VPlanBuildStressTest(
344     "vplan-build-stress-test", cl::init(false), cl::Hidden,
345     cl::desc(
346         "Build VPlan for every supported loop nest in the function and bail "
347         "out right after the build (stress test the VPlan H-CFG construction "
348         "in the VPlan-native vectorization path)."));
349 
350 cl::opt<bool> llvm::EnableLoopInterleaving(
351     "interleave-loops", cl::init(true), cl::Hidden,
352     cl::desc("Enable loop interleaving in Loop vectorization passes"));
353 cl::opt<bool> llvm::EnableLoopVectorization(
354     "vectorize-loops", cl::init(true), cl::Hidden,
355     cl::desc("Run the Loop vectorization passes"));
356 
357 /// A helper function that returns the type of loaded or stored value.
358 static Type *getMemInstValueType(Value *I) {
359   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
360          "Expected Load or Store instruction");
361   if (auto *LI = dyn_cast<LoadInst>(I))
362     return LI->getType();
363   return cast<StoreInst>(I)->getValueOperand()->getType();
364 }
365 
366 /// A helper function that returns true if the given type is irregular. The
367 /// type is irregular if its allocated size doesn't equal the store size of an
368 /// element of the corresponding vector type at the given vectorization factor.
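///
/// For example (assuming a typical x86 data layout, given here only as an
/// illustration): x86_fp80 has a 10-byte store size but a 16-byte alloc size,
/// so arrays of it contain padding and the type is irregular, while i32 is
/// regular because its alloc and store sizes match. A rough usage sketch:
///
///   hasIrregularType(Type::getX86_FP80Ty(Ctx), DL, ElementCount::getFixed(4))
///     // expected to be true on such a layout (padding between elements)
///   hasIrregularType(Type::getInt32Ty(Ctx), DL, ElementCount::getFixed(4))
///     // expected to be false (tightly packed)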
369 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
370   assert(!VF.isScalable() && "scalable vectors not yet supported.");
371   // Determine if an array of VF elements of type Ty is "bitcast compatible"
372   // with a <VF x Ty> vector.
373   if (VF.isVector()) {
374     auto *VectorTy = VectorType::get(Ty, VF);
375     return TypeSize::get(VF.getKnownMinValue() *
376                              DL.getTypeAllocSize(Ty).getFixedValue(),
377                          VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
378   }
379 
380   // If the vectorization factor is one, we just check if an array of type Ty
381   // requires padding between elements.
382   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
383 }
384 
385 /// A helper function that returns the reciprocal of the block probability of
386 /// predicated blocks. If we return X, we are assuming the predicated block
387 /// will execute once for every X iterations of the loop header.
388 ///
389 /// TODO: We should use actual block probability here, if available. Currently,
390 ///       we always assume predicated blocks have a 50% chance of executing.
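///
/// Illustrative arithmetic only (a sketch, not a quote of the cost model): if
/// a predicated block costs C per execution, its assumed per-iteration
/// contribution is C / getReciprocalPredBlockProb() = C / 2, i.e. the block
/// is expected to execute on roughly half of the header's iterations.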
391 static unsigned getReciprocalPredBlockProb() { return 2; }
392 
393 /// A helper function that adds a 'fast' flag to floating-point operations.
394 static Value *addFastMathFlag(Value *V) {
395   if (isa<FPMathOperator>(V))
396     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
397   return V;
398 }
399 
400 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
401   if (isa<FPMathOperator>(V))
402     cast<Instruction>(V)->setFastMathFlags(FMF);
403   return V;
404 }
405 
406 /// A helper function that returns an integer or floating-point constant with
407 /// value C.
408 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
409   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
410                            : ConstantFP::get(Ty, C);
411 }
412 
413 /// Returns "best known" trip count for the specified loop \p L as defined by
414 /// the following procedure:
415 ///   1) Returns exact trip count if it is known.
416 ///   2) Returns expected trip count according to profile data if any.
417 ///   3) Returns upper bound estimate if it is known.
418 ///   4) Returns None if all of the above failed.
419 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
420   // Check if exact trip count is known.
421   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
422     return ExpectedTC;
423 
424   // Check if there is an expected trip count available from profile data.
425   if (LoopVectorizeWithBlockFrequency)
426     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
427       return EstimatedTC;
428 
429   // Check if upper bound estimate is known.
430   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
431     return ExpectedTC;
432 
433   return None;
434 }
435 
436 namespace llvm {
437 
438 /// InnerLoopVectorizer vectorizes loops which contain only one basic
439 /// block to a specified vectorization factor (VF).
440 /// This class performs the widening of scalars into vectors, or multiple
441 /// scalars. This class also implements the following features:
442 /// * It inserts an epilogue loop for handling loops that don't have iteration
443 ///   counts that are known to be a multiple of the vectorization factor.
444 /// * It handles the code generation for reduction variables.
445 /// * Scalarization (implementation using scalars) of un-vectorizable
446 ///   instructions.
447 /// InnerLoopVectorizer does not perform any vectorization-legality
448 /// checks, and relies on the caller to check for the different legality
449 /// aspects. The InnerLoopVectorizer relies on the LoopVectorizationLegality
450 /// class to provide information about the induction and reduction variables
451 /// that were found for the given vectorization factor.
452 class InnerLoopVectorizer {
453 public:
454   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
455                       LoopInfo *LI, DominatorTree *DT,
456                       const TargetLibraryInfo *TLI,
457                       const TargetTransformInfo *TTI, AssumptionCache *AC,
458                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
459                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
460                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
461                       ProfileSummaryInfo *PSI)
462       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
463         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
464         Builder(PSE.getSE()->getContext()),
465         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
466         BFI(BFI), PSI(PSI) {
467     // Query this against the original loop and save it here because the profile
468     // of the original loop header may change as the transformation happens.
469     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
470         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
471   }
472 
473   virtual ~InnerLoopVectorizer() = default;
474 
475   /// Create a new empty loop that will contain vectorized instructions later
476   /// on, while the old loop will be used as the scalar remainder. Control flow
477   /// is generated around the vectorized (and scalar epilogue) loops consisting
478   /// of various checks and bypasses. Return the pre-header block of the new
479   /// loop.
480   /// In the case of epilogue vectorization, this function is overridden to
481   /// handle the more complex control flow around the loops.
482   virtual BasicBlock *createVectorizedLoopSkeleton();
483 
484   /// Widen a single instruction within the innermost loop.
485   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
486                         VPTransformState &State);
487 
488   /// Widen a single call instruction within the innermost loop.
489   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
490                             VPTransformState &State);
491 
492   /// Widen a single select instruction within the innermost loop.
493   void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
494                               bool InvariantCond, VPTransformState &State);
495 
496   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
497   void fixVectorizedLoop();
498 
499   // Return true if any runtime check is added.
500   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
501 
502   /// A type for vectorized values in the new loop. Each value from the
503   /// original loop, when vectorized, is represented by UF vector values in the
504   /// new unrolled loop, where UF is the unroll factor.
505   using VectorParts = SmallVector<Value *, 2>;
506 
507   /// Vectorize a single GetElementPtrInst based on information gathered and
508   /// decisions taken during planning.
509   void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
510                 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
511                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
512 
513   /// Vectorize a single PHINode in a block. This method handles the induction
514   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
515   /// arbitrary length vectors.
516   void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);
517 
518   /// A helper function to scalarize a single Instruction in the innermost loop.
519   /// Generates a sequence of scalar instances for each lane between \p MinLane
520   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
521   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
522   /// Instr's operands.
523   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
524                             const VPIteration &Instance, bool IfPredicateInstr,
525                             VPTransformState &State);
526 
527   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
528   /// is provided, the integer induction variable will first be truncated to
529   /// the corresponding type.
530   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
531 
532   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
533   /// vector or scalar value on-demand if one is not yet available. When
534   /// vectorizing a loop, we visit the definition of an instruction before its
535   /// uses. When visiting the definition, we either vectorize or scalarize the
536   /// instruction, creating an entry for it in the corresponding map. (In some
537   /// cases, such as induction variables, we will create both vector and scalar
538   /// entries.) Then, as we encounter uses of the definition, we derive values
539   /// for each scalar or vector use unless such a value is already available.
540   /// For example, if we scalarize a definition and one of its uses is vector,
541   /// we build the required vector on-demand with an insertelement sequence
542   /// when visiting the use. Otherwise, if the use is scalar, we can use the
543   /// existing scalar definition.
544   ///
545   /// Return a value in the new loop corresponding to \p V from the original
546   /// loop at unroll index \p Part. If the value has already been vectorized,
547   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
548   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
549   /// a new vector value on-demand by inserting the scalar values into a vector
550   /// with an insertelement sequence. If the value has been neither vectorized
551   /// nor scalarized, it must be loop invariant, so we simply broadcast the
552   /// value into a vector.
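  ///
  /// A small illustrative sketch (the lane names here are invented): if a
  /// definition %a was scalarized into lanes a0..a3 and a later user needs it
  /// as a vector, the vector is assembled on demand with an insertelement
  /// chain, conceptually v = {a0, a1, a2, a3}; a value that was neither
  /// vectorized nor scalarized must be loop invariant and is simply broadcast
  /// into all lanes.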
553   Value *getOrCreateVectorValue(Value *V, unsigned Part);
554 
555   void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
556     VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
557   }
558 
559   /// Return a value in the new loop corresponding to \p V from the original
560   /// loop at unroll and vector indices \p Instance. If the value has been
561   /// vectorized but not scalarized, the necessary extractelement instruction
562   /// will be generated.
563   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
564 
565   /// Construct the vector value of a scalarized value \p V one lane at a time.
566   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
567 
568   /// Try to vectorize interleaved access group \p Group with the base address
569   /// given in \p Addr, optionally masking the vector operations if \p
570   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
571   /// values in the vectorized loop.
572   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
573                                 VPTransformState &State, VPValue *Addr,
574                                 ArrayRef<VPValue *> StoredValues,
575                                 VPValue *BlockInMask = nullptr);
576 
577   /// Vectorize Load and Store instructions with the base address given in \p
578   /// Addr, optionally masking the vector operations if \p BlockInMask is
579   /// non-null. Use \p State to translate given VPValues to IR values in the
580   /// vectorized loop.
581   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
582                                   VPValue *Def, VPValue *Addr,
583                                   VPValue *StoredValue, VPValue *BlockInMask);
584 
585   /// Set the debug location in the builder using the debug location in
586   /// the instruction.
587   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
588 
589   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
590   void fixNonInductionPHIs();
591 
592 protected:
593   friend class LoopVectorizationPlanner;
594 
595   /// A small list of PHINodes.
596   using PhiVector = SmallVector<PHINode *, 4>;
597 
598   /// A type for scalarized values in the new loop. Each value from the
599   /// original loop, when scalarized, is represented by UF x VF scalar values
600   /// in the new unrolled loop, where UF is the unroll factor and VF is the
601   /// vectorization factor.
602   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
603 
604   /// Set up the values of the IVs correctly when exiting the vector loop.
605   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
606                     Value *CountRoundDown, Value *EndValue,
607                     BasicBlock *MiddleBlock);
608 
609   /// Create a new induction variable inside L.
610   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
611                                    Value *Step, Instruction *DL);
612 
613   /// Handle all cross-iteration phis in the header.
614   void fixCrossIterationPHIs();
615 
616   /// Fix a first-order recurrence. This is the second phase of vectorizing
617   /// this phi node.
618   void fixFirstOrderRecurrence(PHINode *Phi);
619 
620   /// Fix a reduction cross-iteration phi. This is the second phase of
621   /// vectorizing this phi node.
622   void fixReduction(PHINode *Phi);
623 
624   /// Clear NSW/NUW flags from reduction instructions if necessary.
625   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
626 
627   /// The loop exit block may have single-value PHI nodes with some incoming
628   /// value. While vectorizing we only handled values that were defined inside
629   /// the loop, so such PHIs need to be fixed up to have one incoming value for
630   /// each predecessor of their parent basic block. See PR14725.
631   void fixLCSSAPHIs();
632 
633   /// Iteratively sink the scalarized operands of a predicated instruction into
634   /// the block that was created for it.
635   void sinkScalarOperands(Instruction *PredInst);
636 
637   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
638   /// represented as.
639   void truncateToMinimalBitwidths();
640 
641   /// Create a broadcast instruction. This method generates a broadcast
642   /// instruction (shuffle) for loop invariant values and for the induction
643   /// value. If this is the induction variable then we extend it to N, N+1, ...
644   /// this is needed because each iteration in the loop corresponds to a SIMD
645   /// element.
646   virtual Value *getBroadcastInstrs(Value *V);
647 
648   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
649   /// to each vector element of Val. The sequence starts at StartIndex.
650   /// \p Opcode is relevant for FP induction variable.
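  ///
  /// For instance (an assumed example with an integer induction, StartIdx = 0,
  /// Step = 1 and VF = 4), a splat of the scalar value %iv becomes the vector
  /// {%iv + 0, %iv + 1, %iv + 2, %iv + 3}.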
651   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
652                                Instruction::BinaryOps Opcode =
653                                Instruction::BinaryOpsEnd);
654 
655   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
656   /// variable on which to base the steps, \p Step is the size of the step, and
657   /// \p EntryVal is the value from the original loop that maps to the steps.
658   /// Note that \p EntryVal doesn't have to be an induction variable - it
659   /// can also be a truncate instruction.
660   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
661                         const InductionDescriptor &ID);
662 
663   /// Create a vector induction phi node based on an existing scalar one. \p
664   /// EntryVal is the value from the original loop that maps to the vector phi
665   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
666   /// truncate instruction, instead of widening the original IV, we widen a
667   /// version of the IV truncated to \p EntryVal's type.
668   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
669                                        Value *Step, Instruction *EntryVal);
670 
671   /// Returns true if an instruction \p I should be scalarized instead of
672   /// vectorized for the chosen vectorization factor.
673   bool shouldScalarizeInstruction(Instruction *I) const;
674 
675   /// Returns true if we should generate a scalar version of \p IV.
676   bool needsScalarInduction(Instruction *IV) const;
677 
678   /// If there is a cast involved in the induction variable \p ID, which should
679   /// be ignored in the vectorized loop body, this function records the
680   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
681   /// cast. We had already proved that the casted Phi is equal to the uncasted
682   /// Phi in the vectorized loop (under a runtime guard), and therefore
683   /// there is no need to vectorize the cast - the same value can be used in
684   /// the vector loop for both the Phi and the cast.
685   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
686   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
687   ///
688   /// \p EntryVal is the value from the original loop that maps to the vector
689   /// phi node and is used to distinguish what is the IV currently being
690   /// processed - original one (if \p EntryVal is a phi corresponding to the
691   /// original IV) or the "newly-created" one based on the proof mentioned
692   /// above (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()).
693   /// In the latter case \p EntryVal is a TruncInst and we must not record
694   /// anything for that IV, but it's error-prone to expect callers of this
695   /// routine to care about that, hence this explicit parameter.
696   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
697                                              const Instruction *EntryVal,
698                                              Value *VectorLoopValue,
699                                              unsigned Part,
700                                              unsigned Lane = UINT_MAX);
701 
702   /// Generate a shuffle sequence that will reverse the vector Vec.
703   virtual Value *reverseVector(Value *Vec);
704 
705   /// Returns (and creates if needed) the original loop trip count.
706   Value *getOrCreateTripCount(Loop *NewLoop);
707 
708   /// Returns (and creates if needed) the trip count of the widened loop.
709   Value *getOrCreateVectorTripCount(Loop *NewLoop);
710 
711   /// Returns a bitcasted value to the requested vector type.
712   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
713   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
714                                 const DataLayout &DL);
715 
716   /// Emit a bypass check to see if the vector trip count is zero, including if
717   /// it overflows.
718   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
719 
720   /// Emit a bypass check to see if all of the SCEV assumptions we've
721   /// had to make are correct.
722   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
723 
724   /// Emit bypass checks to check any memory assumptions we may have made.
725   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
726 
727   /// Compute the transformed value of Index at offset StartValue using step
728   /// StepValue.
729   /// For integer induction, returns StartValue + Index * StepValue.
730   /// For pointer induction, returns StartValue[Index * StepValue].
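  /// For example (with assumed values StartValue = 8, StepValue = 4 and
  /// Index = 3), an integer induction yields 8 + 3 * 4 = 20, while a pointer
  /// induction yields the address of StartValue[12], i.e. StartValue advanced
  /// by Index * StepValue elements.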
731   /// FIXME: The newly created binary instructions should contain nsw/nuw
732   /// flags, which can be found from the original scalar operations.
733   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
734                               const DataLayout &DL,
735                               const InductionDescriptor &ID) const;
736 
737   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
738   /// vector loop preheader, middle block and scalar preheader. Also
739   /// allocate a loop object for the new vector loop and return it.
740   Loop *createVectorLoopSkeleton(StringRef Prefix);
741 
742   /// Create new phi nodes for the induction variables to resume iteration count
743   /// in the scalar epilogue, from where the vectorized loop left off (given by
744   /// \p VectorTripCount).
745   /// In cases where the loop skeleton is more complicated (e.g. epilogue
746   /// vectorization) and the resume values can come from an additional bypass
747   /// block, the \p AdditionalBypass pair provides information about the bypass
748   /// block and the end value on the edge from bypass to this loop.
749   void createInductionResumeValues(
750       Loop *L, Value *VectorTripCount,
751       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
752 
753   /// Complete the loop skeleton by adding debug MDs, creating appropriate
754   /// conditional branches in the middle block, preparing the builder and
755   /// running the verifier. Take in the vector loop \p L as argument, and return
756   /// the preheader of the completed vector loop.
757   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
758 
759   /// Add additional metadata to \p To that was not present on \p Orig.
760   ///
761   /// Currently this is used to add the noalias annotations based on the
762   /// inserted memchecks.  Use this for instructions that are *cloned* into the
763   /// vector loop.
764   void addNewMetadata(Instruction *To, const Instruction *Orig);
765 
766   /// Add metadata from one instruction to another.
767   ///
768   /// This includes both the original MDs from \p From and additional ones (\see
769   /// addNewMetadata).  Use this for *newly created* instructions in the vector
770   /// loop.
771   void addMetadata(Instruction *To, Instruction *From);
772 
773   /// Similar to the previous function but it adds the metadata to a
774   /// vector of instructions.
775   void addMetadata(ArrayRef<Value *> To, Instruction *From);
776 
777   /// Allow subclasses to override and print debug traces before/after vplan
778   /// execution, when trace information is requested.
779   virtual void printDebugTracesAtStart() {}
780   virtual void printDebugTracesAtEnd() {}
781 
782   /// The original loop.
783   Loop *OrigLoop;
784 
785   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
786   /// dynamic knowledge to simplify SCEV expressions and converts them to a
787   /// more usable form.
788   PredicatedScalarEvolution &PSE;
789 
790   /// Loop Info.
791   LoopInfo *LI;
792 
793   /// Dominator Tree.
794   DominatorTree *DT;
795 
796   /// Alias Analysis.
797   AAResults *AA;
798 
799   /// Target Library Info.
800   const TargetLibraryInfo *TLI;
801 
802   /// Target Transform Info.
803   const TargetTransformInfo *TTI;
804 
805   /// Assumption Cache.
806   AssumptionCache *AC;
807 
808   /// Interface to emit optimization remarks.
809   OptimizationRemarkEmitter *ORE;
810 
811   /// LoopVersioning.  It's only set up (non-null) if memchecks were
812   /// used.
813   ///
814   /// This is currently only used to add no-alias metadata based on the
815   /// memchecks.  The actual versioning is performed manually.
816   std::unique_ptr<LoopVersioning> LVer;
817 
818   /// The vectorization SIMD factor to use. Each vector will have this many
819   /// vector elements.
820   ElementCount VF;
821 
822   /// The vectorization unroll factor to use. Each scalar is vectorized to this
823   /// many different vector instructions.
824   unsigned UF;
825 
826   /// The builder that we use
827   IRBuilder<> Builder;
828 
829   // --- Vectorization state ---
830 
831   /// The vector-loop preheader.
832   BasicBlock *LoopVectorPreHeader;
833 
834   /// The scalar-loop preheader.
835   BasicBlock *LoopScalarPreHeader;
836 
837   /// The middle block between the vector and the scalar loops.
838   BasicBlock *LoopMiddleBlock;
839 
840   /// The ExitBlock of the scalar loop.
841   BasicBlock *LoopExitBlock;
842 
843   /// The vector loop body.
844   BasicBlock *LoopVectorBody;
845 
846   /// The scalar loop body.
847   BasicBlock *LoopScalarBody;
848 
849   /// A list of all bypass blocks. The first block is the entry of the loop.
850   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
851 
852   /// The new Induction variable which was added to the new block.
853   PHINode *Induction = nullptr;
854 
855   /// The induction variable of the old basic block.
856   PHINode *OldInduction = nullptr;
857 
858   /// Maps values from the original loop to their corresponding values in the
859   /// vectorized loop. A key value can map to either vector values, scalar
860   /// values or both kinds of values, depending on whether the key was
861   /// vectorized and scalarized.
862   VectorizerValueMap VectorLoopValueMap;
863 
864   /// Store instructions that were predicated.
865   SmallVector<Instruction *, 4> PredicatedInstructions;
866 
867   /// Trip count of the original loop.
868   Value *TripCount = nullptr;
869 
870   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
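  /// (e.g., with an assumed TripCount of 1003, VF = 4 and UF = 2, this is
  /// 1003 - 1003 % 8 = 1000, and the remaining 3 iterations are left to the
  /// scalar epilogue).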
871   Value *VectorTripCount = nullptr;
872 
873   /// The legality analysis.
874   LoopVectorizationLegality *Legal;
875 
876   /// The profitability analysis.
877   LoopVectorizationCostModel *Cost;
878 
879   // Record whether runtime checks are added.
880   bool AddedSafetyChecks = false;
881 
882   // Holds the end values for each induction variable. We save the end values
883   // so we can later fix-up the external users of the induction variables.
884   DenseMap<PHINode *, Value *> IVEndValues;
885 
886   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
887   // fixed up at the end of vector code generation.
888   SmallVector<PHINode *, 8> OrigPHIsToFix;
889 
890   /// BFI and PSI are used to check for profile guided size optimizations.
891   BlockFrequencyInfo *BFI;
892   ProfileSummaryInfo *PSI;
893 
894   // Whether this loop should be optimized for size based on profile-guided
895   // size optimizations.
896   bool OptForSizeBasedOnProfile;
897 };
898 
899 class InnerLoopUnroller : public InnerLoopVectorizer {
900 public:
901   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
902                     LoopInfo *LI, DominatorTree *DT,
903                     const TargetLibraryInfo *TLI,
904                     const TargetTransformInfo *TTI, AssumptionCache *AC,
905                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
906                     LoopVectorizationLegality *LVL,
907                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
908                     ProfileSummaryInfo *PSI)
909       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
910                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
911                             BFI, PSI) {}
912 
913 private:
914   Value *getBroadcastInstrs(Value *V) override;
915   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
916                        Instruction::BinaryOps Opcode =
917                        Instruction::BinaryOpsEnd) override;
918   Value *reverseVector(Value *Vec) override;
919 };
920 
921 /// Encapsulate information regarding vectorization of a loop and its epilogue.
922 /// This information is meant to be updated and used across two stages of
923 /// epilogue vectorization.
924 struct EpilogueLoopVectorizationInfo {
925   ElementCount MainLoopVF = ElementCount::getFixed(0);
926   unsigned MainLoopUF = 0;
927   ElementCount EpilogueVF = ElementCount::getFixed(0);
928   unsigned EpilogueUF = 0;
929   BasicBlock *MainLoopIterationCountCheck = nullptr;
930   BasicBlock *EpilogueIterationCountCheck = nullptr;
931   BasicBlock *SCEVSafetyCheck = nullptr;
932   BasicBlock *MemSafetyCheck = nullptr;
933   Value *TripCount = nullptr;
934   Value *VectorTripCount = nullptr;
935 
936   EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
937                                 unsigned EUF)
938       : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
939         EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
940     assert(EUF == 1 &&
941            "A high UF for the epilogue loop is likely not beneficial.");
942   }
943 };
944 
945 /// An extension of the inner loop vectorizer that creates a skeleton for a
946 /// vectorized loop that has its epilogue (residual) also vectorized.
947 /// The idea is to run the vplan on a given loop twice, first to set up the
948 /// skeleton and vectorize the main loop, and second to complete the skeleton
949 /// from the first step and vectorize the epilogue.  This is achieved by
950 /// deriving two concrete strategy classes from this base class and invoking
951 /// them in succession from the loop vectorizer planner.
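///
/// Roughly, the resulting skeleton executes (simplified here; the actual
/// skeleton also contains SCEV and memory runtime checks): an iteration count
/// check, the main vector loop, a second iteration count check, the vector
/// epilogue loop, and finally the scalar epilogue for any iterations that
/// still remain.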
952 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
953 public:
954   InnerLoopAndEpilogueVectorizer(
955       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
956       DominatorTree *DT, const TargetLibraryInfo *TLI,
957       const TargetTransformInfo *TTI, AssumptionCache *AC,
958       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
959       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
960       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
961       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
962                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
963         EPI(EPI) {}
964 
965   // Override this function to handle the more complex control flow around the
966   // three loops.
967   BasicBlock *createVectorizedLoopSkeleton() final override {
968     return createEpilogueVectorizedLoopSkeleton();
969   }
970 
971   /// The interface for creating a vectorized skeleton using one of two
972   /// different strategies, each corresponding to one execution of the vplan
973   /// as described above.
974   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
975 
976   /// Holds and updates state information required to vectorize the main loop
977   /// and its epilogue in two separate passes. This setup helps us avoid
978   /// regenerating and recomputing runtime safety checks. It also helps us to
979   /// shorten the iteration-count-check path length for the cases where the
980   /// iteration count of the loop is so small that the main vector loop is
981   /// completely skipped.
982   EpilogueLoopVectorizationInfo &EPI;
983 };
984 
985 /// A specialized derived class of inner loop vectorizer that performs
986 /// vectorization of *main* loops in the process of vectorizing loops and their
987 /// epilogues.
988 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
989 public:
990   EpilogueVectorizerMainLoop(
991       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
992       DominatorTree *DT, const TargetLibraryInfo *TLI,
993       const TargetTransformInfo *TTI, AssumptionCache *AC,
994       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
995       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
996       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
997       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
998                                        EPI, LVL, CM, BFI, PSI) {}
999   /// Implements the interface for creating a vectorized skeleton using the
1000   /// *main loop* strategy (i.e. the first pass of vplan execution).
1001   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1002 
1003 protected:
1004   /// Emits an iteration count bypass check once for the main loop (when \p
1005   /// ForEpilogue is false) and once for the epilogue loop (when \p
1006   /// ForEpilogue is true).
1007   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
1008                                              bool ForEpilogue);
1009   void printDebugTracesAtStart() override;
1010   void printDebugTracesAtEnd() override;
1011 };
1012 
1013 /// A specialized derived class of inner loop vectorizer that performs
1014 /// vectorization of *epilogue* loops in the process of vectorizing loops and
1015 /// their epilogues.
1016 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
1017 public:
1018   EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
1019                     LoopInfo *LI, DominatorTree *DT,
1020                     const TargetLibraryInfo *TLI,
1021                     const TargetTransformInfo *TTI, AssumptionCache *AC,
1022                     OptimizationRemarkEmitter *ORE,
1023                     EpilogueLoopVectorizationInfo &EPI,
1024                     LoopVectorizationLegality *LVL,
1025                     llvm::LoopVectorizationCostModel *CM,
1026                     BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
1027       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1028                                        EPI, LVL, CM, BFI, PSI) {}
1029   /// Implements the interface for creating a vectorized skeleton using the
1030   /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
1031   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1032 
1033 protected:
1034   /// Emits an iteration count bypass check after the main vector loop has
1035   /// finished to see if there are any iterations left to execute by either
1036   /// the vector epilogue or the scalar epilogue.
1037   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1038                                                       BasicBlock *Bypass,
1039                                                       BasicBlock *Insert);
1040   void printDebugTracesAtStart() override;
1041   void printDebugTracesAtEnd() override;
1042 };
1043 } // end namespace llvm
1044 
1045 /// Look for a meaningful debug location on the instruction or its
1046 /// operands.
1047 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1048   if (!I)
1049     return I;
1050 
1051   DebugLoc Empty;
1052   if (I->getDebugLoc() != Empty)
1053     return I;
1054 
1055   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
1056     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
1057       if (OpInst->getDebugLoc() != Empty)
1058         return OpInst;
1059   }
1060 
1061   return I;
1062 }
1063 
1064 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
1065   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1066     const DILocation *DIL = Inst->getDebugLoc();
1067     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1068         !isa<DbgInfoIntrinsic>(Inst)) {
1069       assert(!VF.isScalable() && "scalable vectors not yet supported.");
1070       auto NewDIL =
1071           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1072       if (NewDIL)
1073         B.SetCurrentDebugLocation(NewDIL.getValue());
1074       else
1075         LLVM_DEBUG(dbgs()
1076                    << "Failed to create new discriminator: "
1077                    << DIL->getFilename() << " Line: " << DIL->getLine());
1078     }
1079     else
1080       B.SetCurrentDebugLocation(DIL);
1081   } else
1082     B.SetCurrentDebugLocation(DebugLoc());
1083 }
1084 
1085 /// Write a record \p DebugMsg about vectorization failure to the debug
1086 /// output stream. If \p I is passed, it is an instruction that prevents
1087 /// vectorization.
1088 #ifndef NDEBUG
1089 static void debugVectorizationFailure(const StringRef DebugMsg,
1090     Instruction *I) {
1091   dbgs() << "LV: Not vectorizing: " << DebugMsg;
1092   if (I != nullptr)
1093     dbgs() << " " << *I;
1094   else
1095     dbgs() << '.';
1096   dbgs() << '\n';
1097 }
1098 #endif
1099 
1100 /// Create an analysis remark that explains why vectorization failed
1101 ///
1102 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1103 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1104 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1105 /// the location of the remark.  \return the remark object that can be
1106 /// streamed to.
1107 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1108     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1109   Value *CodeRegion = TheLoop->getHeader();
1110   DebugLoc DL = TheLoop->getStartLoc();
1111 
1112   if (I) {
1113     CodeRegion = I->getParent();
1114     // If there is no debug location attached to the instruction, fall back to
1115     // using the loop's location.
1116     if (I->getDebugLoc())
1117       DL = I->getDebugLoc();
1118   }
1119 
1120   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
1121   R << "loop not vectorized: ";
1122   return R;
1123 }
1124 
1125 namespace llvm {
1126 
1127 void reportVectorizationFailure(const StringRef DebugMsg,
1128     const StringRef OREMsg, const StringRef ORETag,
1129     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
1130   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
1131   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1132   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
1133                 ORETag, TheLoop, I) << OREMsg);
1134 }
1135 
1136 } // end namespace llvm
1137 
1138 #ifndef NDEBUG
1139 /// \return string containing a file name and a line # for the given loop.
1140 static std::string getDebugLocString(const Loop *L) {
1141   std::string Result;
1142   if (L) {
1143     raw_string_ostream OS(Result);
1144     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1145       LoopDbgLoc.print(OS);
1146     else
1147       // Just print the module name.
1148       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1149     OS.flush();
1150   }
1151   return Result;
1152 }
1153 #endif
1154 
1155 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1156                                          const Instruction *Orig) {
1157   // If the loop was versioned with memchecks, add the corresponding no-alias
1158   // metadata.
1159   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1160     LVer->annotateInstWithNoAlias(To, Orig);
1161 }
1162 
1163 void InnerLoopVectorizer::addMetadata(Instruction *To,
1164                                       Instruction *From) {
1165   propagateMetadata(To, From);
1166   addNewMetadata(To, From);
1167 }
1168 
1169 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1170                                       Instruction *From) {
1171   for (Value *V : To) {
1172     if (Instruction *I = dyn_cast<Instruction>(V))
1173       addMetadata(I, From);
1174   }
1175 }
1176 
1177 namespace llvm {
1178 
1179 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1180 // lowered.
1181 enum ScalarEpilogueLowering {
1182 
1183   // The default: allowing scalar epilogues.
1184   CM_ScalarEpilogueAllowed,
1185 
1186   // Vectorization with OptForSize: don't allow epilogues.
1187   CM_ScalarEpilogueNotAllowedOptSize,
1188 
  // A special case of vectorization with OptForSize: loops with a very small
1190   // trip count are considered for vectorization under OptForSize, thereby
1191   // making sure the cost of their loop body is dominant, free of runtime
1192   // guards and scalar iteration overheads.
1193   CM_ScalarEpilogueNotAllowedLowTripLoop,
1194 
1195   // Loop hint predicate indicating an epilogue is undesired.
1196   CM_ScalarEpilogueNotNeededUsePredicate
1197 };
1198 
1199 /// LoopVectorizationCostModel - estimates the expected speedups due to
1200 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the expected
/// speedup/slowdown due to the supported instruction set. We use the
1204 /// TargetTransformInfo to query the different backends for the cost of
1205 /// different operations.
1206 class LoopVectorizationCostModel {
1207 public:
1208   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1209                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1210                              LoopVectorizationLegality *Legal,
1211                              const TargetTransformInfo &TTI,
1212                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1213                              AssumptionCache *AC,
1214                              OptimizationRemarkEmitter *ORE, const Function *F,
1215                              const LoopVectorizeHints *Hints,
1216                              InterleavedAccessInfo &IAI)
1217       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1218         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1219         Hints(Hints), InterleaveInfo(IAI) {}
1220 
1221   /// \return An upper bound for the vectorization factor, or None if
1222   /// vectorization and interleaving should be avoided up front.
1223   Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1224 
1225   /// \return True if runtime checks are required for vectorization, and false
1226   /// otherwise.
1227   bool runtimeChecksRequired();
1228 
1229   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero,
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
1233   VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1234   VectorizationFactor
1235   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1236                                     const LoopVectorizationPlanner &LVP);
1237 
1238   /// Setup cost-based decisions for user vectorization factor.
1239   void selectUserVectorizationFactor(ElementCount UserVF) {
1240     collectUniformsAndScalars(UserVF);
1241     collectInstsToScalarize(UserVF);
1242   }
1243 
1244   /// \return The size (in bits) of the smallest and widest types in the code
1245   /// that needs to be vectorized. We ignore values that remain scalar such as
1246   /// 64 bit loop indices.
1247   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1248 
1249   /// \return The desired interleave count.
1250   /// If interleave count has been specified by metadata it will be returned.
1251   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1252   /// are the selected vectorization factor and the cost of the selected VF.
1253   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1254 
  /// A memory access instruction may be vectorized in more than one way; the
  /// form it takes after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1262   void setCostBasedWideningDecision(ElementCount VF);
1263 
1264   /// A struct that represents some properties of the register usage
1265   /// of a loop.
1266   struct RegisterUsage {
1267     /// Holds the number of loop invariant values that are used in the loop.
1268     /// The key is ClassID of target-provided register class.
1269     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1270     /// Holds the maximum number of concurrent live intervals in the loop.
1271     /// The key is ClassID of target-provided register class.
1272     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1273   };
1274 
  /// \return Information about the register usage of the loop for the
  /// given vectorization factors.
1277   SmallVector<RegisterUsage, 8>
1278   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1279 
1280   /// Collect values we want to ignore in the cost model.
1281   void collectValuesToIgnore();
1282 
  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
1285   void collectInLoopReductions();
1286 
1287   /// \returns The smallest bitwidth each instruction can be represented with.
1288   /// The vector equivalents of these instructions should be truncated to this
1289   /// type.
1290   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1291     return MinBWs;
1292   }
1293 
1294   /// \returns True if it is more profitable to scalarize instruction \p I for
1295   /// vectorization factor \p VF.
1296   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1297     assert(VF.isVector() &&
1298            "Profitable to scalarize relevant only for VF > 1.");
1299 
1300     // Cost model is not run in the VPlan-native path - return conservative
1301     // result until this changes.
1302     if (EnableVPlanNativePath)
1303       return false;
1304 
1305     auto Scalars = InstsToScalarize.find(VF);
1306     assert(Scalars != InstsToScalarize.end() &&
1307            "VF not yet analyzed for scalarization profitability");
1308     return Scalars->second.find(I) != Scalars->second.end();
1309   }
1310 
1311   /// Returns true if \p I is known to be uniform after vectorization.
1312   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1313     if (VF.isScalar())
1314       return true;
1315 
1316     // Cost model is not run in the VPlan-native path - return conservative
1317     // result until this changes.
1318     if (EnableVPlanNativePath)
1319       return false;
1320 
1321     auto UniformsPerVF = Uniforms.find(VF);
1322     assert(UniformsPerVF != Uniforms.end() &&
1323            "VF not yet analyzed for uniformity");
1324     return UniformsPerVF->second.count(I);
1325   }
1326 
1327   /// Returns true if \p I is known to be scalar after vectorization.
1328   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1329     if (VF.isScalar())
1330       return true;
1331 
1332     // Cost model is not run in the VPlan-native path - return conservative
1333     // result until this changes.
1334     if (EnableVPlanNativePath)
1335       return false;
1336 
1337     auto ScalarsPerVF = Scalars.find(VF);
1338     assert(ScalarsPerVF != Scalars.end() &&
1339            "Scalar values are not calculated for VF");
1340     return ScalarsPerVF->second.count(I);
1341   }
1342 
1343   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1344   /// for vectorization factor \p VF.
1345   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1346     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1347            !isProfitableToScalarize(I, VF) &&
1348            !isScalarAfterVectorization(I, VF);
1349   }
1350 
1351   /// Decision that was taken during cost calculation for memory instruction.
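  ///
  /// For illustration only (hypothetical accesses, not taken from this file):
  ///   A[i]    with i = 0, 1, 2, ...  -> CM_Widen (stride +1)
  ///   A[i]    with i = n, n-1, ...   -> CM_Widen_Reverse (stride -1)
  ///   A[B[i]] (non-consecutive)      -> CM_GatherScatter if legal, else
  ///                                     CM_Scalarize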
1352   enum InstWidening {
1353     CM_Unknown,
1354     CM_Widen,         // For consecutive accesses with stride +1.
1355     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1356     CM_Interleave,
1357     CM_GatherScatter,
1358     CM_Scalarize
1359   };
1360 
1361   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1362   /// instruction \p I and vector width \p VF.
1363   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1364                            unsigned Cost) {
1365     assert(VF.isVector() && "Expected VF >=2");
1366     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1367   }
1368 
1369   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1370   /// interleaving group \p Grp and vector width \p VF.
1371   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1372                            ElementCount VF, InstWidening W, unsigned Cost) {
1373     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group,
    // but assign the cost to one instruction only.
1376     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1377       if (auto *I = Grp->getMember(i)) {
1378         if (Grp->getInsertPos() == I)
1379           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1380         else
1381           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1382       }
1383     }
1384   }
1385 
1386   /// Return the cost model decision for the given instruction \p I and vector
1387   /// width \p VF. Return CM_Unknown if this instruction did not pass
1388   /// through the cost modeling.
1389   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1390     assert(!VF.isScalable() && "scalable vectors not yet supported.");
1391     assert(VF.isVector() && "Expected VF >=2");
1392 
1393     // Cost model is not run in the VPlan-native path - return conservative
1394     // result until this changes.
1395     if (EnableVPlanNativePath)
1396       return CM_GatherScatter;
1397 
1398     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1399     auto Itr = WideningDecisions.find(InstOnVF);
1400     if (Itr == WideningDecisions.end())
1401       return CM_Unknown;
1402     return Itr->second.first;
1403   }
1404 
1405   /// Return the vectorization cost for the given instruction \p I and vector
1406   /// width \p VF.
1407   unsigned getWideningCost(Instruction *I, ElementCount VF) {
1408     assert(VF.isVector() && "Expected VF >=2");
1409     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1410     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1411            "The cost is not calculated");
1412     return WideningDecisions[InstOnVF].second;
1413   }
1414 
  /// Returns true if instruction \p I is an optimizable truncate whose operand
1416   /// is an induction variable. Such a truncate will be removed by adding a new
1417   /// induction variable with the destination type.
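  ///
  /// A sketch of the pattern this looks for (hypothetical IR, for illustration
  /// only):
  ///   %iv = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
  ///   %t  = trunc i64 %iv to i32  ; replaceable by a new i32 induction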
1418   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1419     // If the instruction is not a truncate, return false.
1420     auto *Trunc = dyn_cast<TruncInst>(I);
1421     if (!Trunc)
1422       return false;
1423 
1424     // Get the source and destination types of the truncate.
1425     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1426     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1427 
1428     // If the truncate is free for the given types, return false. Replacing a
1429     // free truncate with an induction variable would add an induction variable
1430     // update instruction to each iteration of the loop. We exclude from this
1431     // check the primary induction variable since it will need an update
1432     // instruction regardless.
1433     Value *Op = Trunc->getOperand(0);
1434     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1435       return false;
1436 
1437     // If the truncated value is not an induction variable, return false.
1438     return Legal->isInductionPhi(Op);
1439   }
1440 
1441   /// Collects the instructions to scalarize for each predicated instruction in
1442   /// the loop.
1443   void collectInstsToScalarize(ElementCount VF);
1444 
1445   /// Collect Uniform and Scalar values for the given \p VF.
1446   /// The sets depend on CM decision for Load/Store instructions
1447   /// that may be vectorized as interleave, gather-scatter or scalarized.
1448   void collectUniformsAndScalars(ElementCount VF) {
1449     // Do the analysis once.
1450     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1451       return;
1452     setCostBasedWideningDecision(VF);
1453     collectLoopUniforms(VF);
1454     collectLoopScalars(VF);
1455   }
1456 
1457   /// Returns true if the target machine supports masked store operation
1458   /// for the given \p DataType and kind of access to \p Ptr.
1459   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1460     return Legal->isConsecutivePtr(Ptr) &&
1461            TTI.isLegalMaskedStore(DataType, Alignment);
1462   }
1463 
1464   /// Returns true if the target machine supports masked load operation
1465   /// for the given \p DataType and kind of access to \p Ptr.
1466   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1467     return Legal->isConsecutivePtr(Ptr) &&
1468            TTI.isLegalMaskedLoad(DataType, Alignment);
1469   }
1470 
1471   /// Returns true if the target machine supports masked scatter operation
1472   /// for the given \p DataType.
1473   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1474     return TTI.isLegalMaskedScatter(DataType, Alignment);
1475   }
1476 
1477   /// Returns true if the target machine supports masked gather operation
1478   /// for the given \p DataType.
1479   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1480     return TTI.isLegalMaskedGather(DataType, Alignment);
1481   }
1482 
1483   /// Returns true if the target machine can represent \p V as a masked gather
1484   /// or scatter operation.
1485   bool isLegalGatherOrScatter(Value *V) {
1486     bool LI = isa<LoadInst>(V);
1487     bool SI = isa<StoreInst>(V);
1488     if (!LI && !SI)
1489       return false;
1490     auto *Ty = getMemInstValueType(V);
1491     Align Align = getLoadStoreAlignment(V);
1492     return (LI && isLegalMaskedGather(Ty, Align)) ||
1493            (SI && isLegalMaskedScatter(Ty, Align));
1494   }
1495 
1496   /// Returns true if \p I is an instruction that will be scalarized with
1497   /// predication. Such instructions include conditional stores and
1498   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1501   bool isScalarWithPredication(Instruction *I,
1502                                ElementCount VF = ElementCount::getFixed(1));
1503 
1504   // Returns true if \p I is an instruction that will be predicated either
1505   // through scalar predication or masked load/store or masked gather/scatter.
1506   // Superset of instructions that return true for isScalarWithPredication.
1507   bool isPredicatedInst(Instruction *I) {
1508     if (!blockNeedsPredication(I->getParent()))
1509       return false;
1510     // Loads and stores that need some form of masked operation are predicated
1511     // instructions.
1512     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1513       return Legal->isMaskRequired(I);
1514     return isScalarWithPredication(I);
1515   }
1516 
1517   /// Returns true if \p I is a memory instruction with consecutive memory
1518   /// access that can be widened.
1519   bool
1520   memoryInstructionCanBeWidened(Instruction *I,
1521                                 ElementCount VF = ElementCount::getFixed(1));
1522 
1523   /// Returns true if \p I is a memory instruction in an interleaved-group
1524   /// of memory accesses that can be vectorized with wide vector loads/stores
1525   /// and shuffles.
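  ///
  /// For illustration only (hypothetical loop, not taken from this file): the
  /// two strided accesses below form an interleave group with factor 2 that
  /// can be vectorized with one wide load per iteration plus shuffles:
  ///   for (i = 0; i < n; ++i) {
  ///     x = A[2*i];     // even elements
  ///     y = A[2*i + 1]; // odd elements
  ///   }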
1526   bool
1527   interleavedAccessCanBeWidened(Instruction *I,
1528                                 ElementCount VF = ElementCount::getFixed(1));
1529 
1530   /// Check if \p Instr belongs to any interleaved access group.
1531   bool isAccessInterleaved(Instruction *Instr) {
1532     return InterleaveInfo.isInterleaved(Instr);
1533   }
1534 
1535   /// Get the interleaved access group that \p Instr belongs to.
1536   const InterleaveGroup<Instruction> *
1537   getInterleavedAccessGroup(Instruction *Instr) {
1538     return InterleaveInfo.getInterleaveGroup(Instr);
1539   }
1540 
1541   /// Returns true if an interleaved group requires a scalar iteration
1542   /// to handle accesses with gaps, and there is nothing preventing us from
1543   /// creating a scalar epilogue.
1544   bool requiresScalarEpilogue() const {
1545     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1546   }
1547 
1548   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1549   /// loop hint annotation.
1550   bool isScalarEpilogueAllowed() const {
1551     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1552   }
1553 
1554   /// Returns true if all loop blocks should be masked to fold tail loop.
1555   bool foldTailByMasking() const { return FoldTailByMasking; }
1556 
1557   bool blockNeedsPredication(BasicBlock *BB) {
1558     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1559   }
1560 
1561   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1562   /// nodes to the chain of instructions representing the reductions. Uses a
1563   /// MapVector to ensure deterministic iteration order.
1564   using ReductionChainMap =
1565       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1566 
1567   /// Return the chain of instructions representing an inloop reduction.
1568   const ReductionChainMap &getInLoopReductionChains() const {
1569     return InLoopReductionChains;
1570   }
1571 
1572   /// Returns true if the Phi is part of an inloop reduction.
1573   bool isInLoopReduction(PHINode *Phi) const {
1574     return InLoopReductionChains.count(Phi);
1575   }
1576 
1577   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1578   /// with factor VF.  Return the cost of the instruction, including
1579   /// scalarization overhead if it's needed.
1580   unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1581 
1582   /// Estimate cost of a call instruction CI if it were vectorized with factor
1583   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e., either a vector version isn't available, or it is too
  /// expensive.
1587   unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1588                              bool &NeedToScalarize);
1589 
1590   /// Invalidates decisions already taken by the cost model.
1591   void invalidateCostModelingDecisions() {
1592     WideningDecisions.clear();
1593     Uniforms.clear();
1594     Scalars.clear();
1595   }
1596 
1597 private:
1598   unsigned NumPredStores = 0;
1599 
1600   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1601   /// than zero. One is returned if vectorization should best be avoided due
1602   /// to cost.
1603   ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1604                                     ElementCount UserVF);
1605 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1613   using VectorizationCostTy = std::pair<unsigned, bool>;
1614 
1615   /// Returns the expected execution cost. The unit of the cost does
1616   /// not matter because we use the 'cost' units to compare different
1617   /// vector widths. The cost that is returned is *not* normalized by
1618   /// the factor width.
1619   VectorizationCostTy expectedCost(ElementCount VF);
1620 
1621   /// Returns the execution time cost of an instruction for a given vector
1622   /// width. Vector width of one means scalar.
1623   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1624 
1625   /// The cost-computation logic from getInstructionCost which provides
1626   /// the vector type as an output parameter.
1627   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1628 
1629   /// Calculate vectorization cost of memory instruction \p I.
1630   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1631 
1632   /// The cost computation for scalarized memory instruction.
1633   unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1634 
1635   /// The cost computation for interleaving group of memory instructions.
1636   unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1637 
1638   /// The cost computation for Gather/Scatter instruction.
1639   unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1640 
1641   /// The cost computation for widening instruction \p I with consecutive
1642   /// memory access.
1643   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1644 
  /// The cost calculation for a Load/Store instruction \p I with a uniform
  /// pointer:
  ///   Load: scalar load + broadcast.
  ///   Store: scalar store + (loop invariant value stored ? 0 : extract of
  ///          last element).
1649   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1650 
1651   /// Estimate the overhead of scalarizing an instruction. This is a
1652   /// convenience wrapper for the type-based getScalarizationOverhead API.
1653   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1654 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1657   bool isConsecutiveLoadOrStore(Instruction *I);
1658 
1659   /// Returns true if an artificially high cost for emulated masked memrefs
1660   /// should be used.
1661   bool useEmulatedMaskMemRefHack(Instruction *I);
1662 
1663   /// Map of scalar integer values to the smallest bitwidth they can be legally
1664   /// represented as. The vector equivalents of these values should be truncated
1665   /// to this type.
1666   MapVector<Instruction *, uint64_t> MinBWs;
1667 
1668   /// A type representing the costs for instructions if they were to be
1669   /// scalarized rather than vectorized. The entries are Instruction-Cost
1670   /// pairs.
1671   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1672 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1675   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1676 
1677   /// Records whether it is allowed to have the original scalar loop execute at
1678   /// least once. This may be needed as a fallback loop in case runtime
1679   /// aliasing/dependence checks fail, or to handle the tail/remainder
1680   /// iterations when the trip count is unknown or doesn't divide by the VF,
1681   /// or as a peel-loop to handle gaps in interleave-groups.
1682   /// Under optsize and when the trip count is very small we don't allow any
1683   /// iterations to execute in the scalar loop.
1684   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1685 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1687   bool FoldTailByMasking = false;
1688 
1689   /// A map holding scalar costs for different vectorization factors. The
1690   /// presence of a cost for an instruction in the mapping indicates that the
1691   /// instruction will be scalarized when vectorizing with the associated
1692   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1693   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1694 
1695   /// Holds the instructions known to be uniform after vectorization.
1696   /// The data is collected per VF.
1697   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1698 
1699   /// Holds the instructions known to be scalar after vectorization.
1700   /// The data is collected per VF.
1701   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1702 
1703   /// Holds the instructions (address computations) that are forced to be
1704   /// scalarized.
1705   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1706 
  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
1710   ReductionChainMap InLoopReductionChains;
1711 
1712   /// Returns the expected difference in cost from scalarizing the expression
1713   /// feeding a predicated instruction \p PredInst. The instructions to
1714   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1715   /// non-negative return value implies the expression will be scalarized.
1716   /// Currently, only single-use chains are considered for scalarization.
1717   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1718                               ElementCount VF);
1719 
1720   /// Collect the instructions that are uniform after vectorization. An
1721   /// instruction is uniform if we represent it with a single scalar value in
1722   /// the vectorized loop corresponding to each vector iteration. Examples of
1723   /// uniform instructions include pointer operands of consecutive or
1724   /// interleaved memory accesses. Note that although uniformity implies an
1725   /// instruction will be scalar, the reverse is not true. In general, a
1726   /// scalarized instruction will be represented by VF scalar values in the
1727   /// vectorized loop, each corresponding to an iteration of the original
1728   /// scalar loop.
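  ///
  /// For example (a sketch, not tied to a particular target): in
  ///   for (i = 0; i < n; ++i) A[i] = B[i];
  /// the pointer operands of the consecutive load/store are uniform - a
  /// single scalar address per vector iteration suffices - whereas a
  /// non-uniform scalarized instruction needs VF copies per vector iteration.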
1729   void collectLoopUniforms(ElementCount VF);
1730 
1731   /// Collect the instructions that are scalar after vectorization. An
1732   /// instruction is scalar if it is known to be uniform or will be scalarized
1733   /// during vectorization. Non-uniform scalarized instructions will be
1734   /// represented by VF values in the vectorized loop, each corresponding to an
1735   /// iteration of the original scalar loop.
1736   void collectLoopScalars(ElementCount VF);
1737 
1738   /// Keeps cost model vectorization decision and cost for instructions.
1739   /// Right now it is used for memory instructions only.
1740   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1741                                 std::pair<InstWidening, unsigned>>;
1742 
1743   DecisionList WideningDecisions;
1744 
1745   /// Returns true if \p V is expected to be vectorized and it needs to be
1746   /// extracted.
1747   bool needsExtract(Value *V, ElementCount VF) const {
1748     Instruction *I = dyn_cast<Instruction>(V);
1749     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1750         TheLoop->isLoopInvariant(I))
1751       return false;
1752 
1753     // Assume we can vectorize V (and hence we need extraction) if the
1754     // scalars are not computed yet. This can happen, because it is called
1755     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1756     // the scalars are collected. That should be a safe assumption in most
1757     // cases, because we check if the operands have vectorizable types
1758     // beforehand in LoopVectorizationLegality.
1759     return Scalars.find(VF) == Scalars.end() ||
1760            !isScalarAfterVectorization(I, VF);
1761   };
1762 
1763   /// Returns a range containing only operands needing to be extracted.
1764   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1765                                                    ElementCount VF) {
1766     return SmallVector<Value *, 4>(make_filter_range(
1767         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1768   }
1769 
1770   /// Determines if we have the infrastructure to vectorize loop \p L and its
1771   /// epilogue, assuming the main loop is vectorized by \p VF.
1772   bool isCandidateForEpilogueVectorization(const Loop &L,
1773                                            const ElementCount VF) const;
1774 
1775   /// Returns true if epilogue vectorization is considered profitable, and
1776   /// false otherwise.
1777   /// \p VF is the vectorization factor chosen for the original loop.
1778   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1779 
1780 public:
1781   /// The loop that we evaluate.
1782   Loop *TheLoop;
1783 
1784   /// Predicated scalar evolution analysis.
1785   PredicatedScalarEvolution &PSE;
1786 
1787   /// Loop Info analysis.
1788   LoopInfo *LI;
1789 
1790   /// Vectorization legality.
1791   LoopVectorizationLegality *Legal;
1792 
1793   /// Vector target information.
1794   const TargetTransformInfo &TTI;
1795 
1796   /// Target Library Info.
1797   const TargetLibraryInfo *TLI;
1798 
1799   /// Demanded bits analysis.
1800   DemandedBits *DB;
1801 
1802   /// Assumption cache.
1803   AssumptionCache *AC;
1804 
1805   /// Interface to emit optimization remarks.
1806   OptimizationRemarkEmitter *ORE;
1807 
1808   const Function *TheFunction;
1809 
1810   /// Loop Vectorize Hint.
1811   const LoopVectorizeHints *Hints;
1812 
1813   /// The interleave access information contains groups of interleaved accesses
1814   /// with the same stride and close to each other.
1815   InterleavedAccessInfo &InterleaveInfo;
1816 
1817   /// Values to ignore in the cost model.
1818   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1819 
1820   /// Values to ignore in the cost model when VF > 1.
1821   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1822 
1823   /// Profitable vector factors.
1824   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1825 };
1826 
1827 } // end namespace llvm
1828 
1829 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1830 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1836 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1837 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1838 // provides *explicit vectorization hints* (LV can bypass legal checks and
1839 // assume that vectorization is legal). However, both hints are implemented
1840 // using the same metadata (llvm.loop.vectorize, processed by
1841 // LoopVectorizeHints). This will be fixed in the future when the native IR
1842 // representation for pragma 'omp simd' is introduced.
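//
// For illustration, the kind of user annotation this expects on the outer
// loop (example code, not part of this file):
//   #pragma omp simd simdlen(4)
//   for (i = 0; i < n; ++i)    // outer loop considered for vectorization
//     for (j = 0; j < m; ++j)
//       A[i][j] = ...;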
1843 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1844                                    OptimizationRemarkEmitter *ORE) {
1845   assert(!OuterLp->isInnermost() && "This is not an outer loop");
1846   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1847 
1848   // Only outer loops with an explicit vectorization hint are supported.
1849   // Unannotated outer loops are ignored.
1850   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1851     return false;
1852 
1853   Function *Fn = OuterLp->getHeader()->getParent();
1854   if (!Hints.allowVectorization(Fn, OuterLp,
1855                                 true /*VectorizeOnlyWhenForced*/)) {
1856     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1857     return false;
1858   }
1859 
1860   if (Hints.getInterleave() > 1) {
1861     // TODO: Interleave support is future work.
1862     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1863                          "outer loops.\n");
1864     Hints.emitRemarkWithHints();
1865     return false;
1866   }
1867 
1868   return true;
1869 }
1870 
1871 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1872                                   OptimizationRemarkEmitter *ORE,
1873                                   SmallVectorImpl<Loop *> &V) {
1874   // Collect inner loops and outer loops without irreducible control flow. For
1875   // now, only collect outer loops that have explicit vectorization hints. If we
1876   // are stress testing the VPlan H-CFG construction, we collect the outermost
1877   // loop of every loop nest.
1878   if (L.isInnermost() || VPlanBuildStressTest ||
1879       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1880     LoopBlocksRPO RPOT(&L);
1881     RPOT.perform(LI);
1882     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1883       V.push_back(&L);
1884       // TODO: Collect inner loops inside marked outer loops in case
1885       // vectorization fails for the outer loop. Do not invoke
1886       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1887       // already known to be reducible. We can use an inherited attribute for
1888       // that.
1889       return;
1890     }
1891   }
1892   for (Loop *InnerL : L)
1893     collectSupportedLoops(*InnerL, LI, ORE, V);
1894 }
1895 
1896 namespace {
1897 
1898 /// The LoopVectorize Pass.
1899 struct LoopVectorize : public FunctionPass {
1900   /// Pass identification, replacement for typeid
1901   static char ID;
1902 
1903   LoopVectorizePass Impl;
1904 
1905   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1906                          bool VectorizeOnlyWhenForced = false)
1907       : FunctionPass(ID),
1908         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1909     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1910   }
1911 
1912   bool runOnFunction(Function &F) override {
1913     if (skipFunction(F))
1914       return false;
1915 
1916     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1917     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1918     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1919     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1920     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1921     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1922     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1923     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1924     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1925     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1926     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1927     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1928     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1929 
1930     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1931         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1932 
1933     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1934                         GetLAA, *ORE, PSI).MadeAnyChange;
1935   }
1936 
1937   void getAnalysisUsage(AnalysisUsage &AU) const override {
1938     AU.addRequired<AssumptionCacheTracker>();
1939     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1940     AU.addRequired<DominatorTreeWrapperPass>();
1941     AU.addRequired<LoopInfoWrapperPass>();
1942     AU.addRequired<ScalarEvolutionWrapperPass>();
1943     AU.addRequired<TargetTransformInfoWrapperPass>();
1944     AU.addRequired<AAResultsWrapperPass>();
1945     AU.addRequired<LoopAccessLegacyAnalysis>();
1946     AU.addRequired<DemandedBitsWrapperPass>();
1947     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1948     AU.addRequired<InjectTLIMappingsLegacy>();
1949 
1950     // We currently do not preserve loopinfo/dominator analyses with outer loop
1951     // vectorization. Until this is addressed, mark these analyses as preserved
1952     // only for non-VPlan-native path.
1953     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1954     if (!EnableVPlanNativePath) {
1955       AU.addPreserved<LoopInfoWrapperPass>();
1956       AU.addPreserved<DominatorTreeWrapperPass>();
1957     }
1958 
1959     AU.addPreserved<BasicAAWrapperPass>();
1960     AU.addPreserved<GlobalsAAWrapperPass>();
1961     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1962   }
1963 };
1964 
1965 } // end anonymous namespace
1966 
1967 //===----------------------------------------------------------------------===//
1968 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1969 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1970 //===----------------------------------------------------------------------===//
1971 
1972 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1976   Instruction *Instr = dyn_cast<Instruction>(V);
1977   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1978                      (!Instr ||
1979                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1980   // Place the code for broadcasting invariant variables in the new preheader.
1981   IRBuilder<>::InsertPointGuard Guard(Builder);
1982   if (SafeToHoist)
1983     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1984 
1985   // Broadcast the scalar into all locations in the vector.
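  // For a fixed VF of e.g. 4, this typically materializes as an insertelement
  // of the scalar into lane 0 followed by a zero-mask shufflevector (a sketch;
  // the exact IR is up to the IRBuilder):
  //   %ins   = insertelement <4 x ty> undef, ty %v, i32 0
  //   %splat = shufflevector <4 x ty> %ins, <4 x ty> undef,
  //                          <4 x i32> zeroinitializer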
1986   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1987 
1988   return Shuf;
1989 }
1990 
1991 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1992     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1993   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1994          "Expected either an induction phi-node or a truncate of it!");
1995   Value *Start = II.getStartValue();
1996 
1997   // Construct the initial value of the vector IV in the vector loop preheader
1998   auto CurrIP = Builder.saveIP();
1999   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2000   if (isa<TruncInst>(EntryVal)) {
2001     assert(Start->getType()->isIntegerTy() &&
2002            "Truncation requires an integer type");
2003     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2004     Step = Builder.CreateTrunc(Step, TruncType);
2005     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2006   }
2007   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2008   Value *SteppedStart =
2009       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2010 
2011   // We create vector phi nodes for both integer and floating-point induction
2012   // variables. Here, we determine the kind of arithmetic we will perform.
2013   Instruction::BinaryOps AddOp;
2014   Instruction::BinaryOps MulOp;
2015   if (Step->getType()->isIntegerTy()) {
2016     AddOp = Instruction::Add;
2017     MulOp = Instruction::Mul;
2018   } else {
2019     AddOp = II.getInductionOpcode();
2020     MulOp = Instruction::FMul;
2021   }
2022 
2023   // Multiply the vectorization factor by the step using integer or
2024   // floating-point arithmetic as appropriate.
2025   Value *ConstVF =
2026       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
2027   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
2028 
2029   // Create a vector splat to use in the induction update.
2030   //
2031   // FIXME: If the step is non-constant, we create the vector splat with
2032   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2033   //        handle a constant vector splat.
2034   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2035   Value *SplatVF = isa<Constant>(Mul)
2036                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2037                        : Builder.CreateVectorSplat(VF, Mul);
2038   Builder.restoreIP(CurrIP);
2039 
2040   // We may need to add the step a number of times, depending on the unroll
2041   // factor. The last of those goes into the PHI.
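  // For illustration only (integer IV with step 1, VF = 4, UF = 2; the values
  // are hypothetical): the vector IV phi starts at <0, 1, 2, 3>, part 0 uses
  // that value, part 1 uses the first step.add <4, 5, 6, 7>, and the final
  // step.add <8, 9, 10, 11> is moved to the latch and fed back into the phi.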
2042   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2043                                     &*LoopVectorBody->getFirstInsertionPt());
2044   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2045   Instruction *LastInduction = VecInd;
2046   for (unsigned Part = 0; Part < UF; ++Part) {
2047     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
2048 
2049     if (isa<TruncInst>(EntryVal))
2050       addMetadata(LastInduction, EntryVal);
2051     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
2052 
2053     LastInduction = cast<Instruction>(addFastMathFlag(
2054         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
2055     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2056   }
2057 
2058   // Move the last step to the end of the latch block. This ensures consistent
2059   // placement of all induction updates.
2060   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2061   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2062   auto *ICmp = cast<Instruction>(Br->getCondition());
2063   LastInduction->moveBefore(ICmp);
2064   LastInduction->setName("vec.ind.next");
2065 
2066   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2067   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2068 }
2069 
2070 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2071   return Cost->isScalarAfterVectorization(I, VF) ||
2072          Cost->isProfitableToScalarize(I, VF);
2073 }
2074 
2075 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2076   if (shouldScalarizeInstruction(IV))
2077     return true;
2078   auto isScalarInst = [&](User *U) -> bool {
2079     auto *I = cast<Instruction>(U);
2080     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2081   };
2082   return llvm::any_of(IV->users(), isScalarInst);
2083 }
2084 
2085 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2086     const InductionDescriptor &ID, const Instruction *EntryVal,
2087     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
2088   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2089          "Expected either an induction phi-node or a truncate of it!");
2090 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
2097   if (isa<TruncInst>(EntryVal))
2098     return;
2099 
2100   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2101   if (Casts.empty())
2102     return;
2103   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
2105   // induction update chain itself.
2106   Instruction *CastInst = *Casts.begin();
2107   if (Lane < UINT_MAX)
2108     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
2109   else
2110     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
2111 }
2112 
2113 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
2114   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2115          "Primary induction variable must have an integer type");
2116 
2117   auto II = Legal->getInductionVars().find(IV);
2118   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2119 
2120   auto ID = II->second;
2121   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2122 
2123   // The value from the original loop to which we are mapping the new induction
2124   // variable.
2125   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2126 
2127   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2128 
2129   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
2131   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2132     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2133            "Induction step should be loop invariant");
2134     if (PSE.getSE()->isSCEVable(IV->getType())) {
2135       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2136       return Exp.expandCodeFor(Step, Step->getType(),
2137                                LoopVectorPreHeader->getTerminator());
2138     }
2139     return cast<SCEVUnknown>(Step)->getValue();
2140   };
2141 
2142   // The scalar value to broadcast. This is derived from the canonical
2143   // induction variable. If a truncation type is given, truncate the canonical
2144   // induction variable and step. Otherwise, derive these values from the
2145   // induction descriptor.
2146   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2147     Value *ScalarIV = Induction;
2148     if (IV != OldInduction) {
2149       ScalarIV = IV->getType()->isIntegerTy()
2150                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2151                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2152                                           IV->getType());
2153       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2154       ScalarIV->setName("offset.idx");
2155     }
2156     if (Trunc) {
2157       auto *TruncType = cast<IntegerType>(Trunc->getType());
2158       assert(Step->getType()->isIntegerTy() &&
2159              "Truncation requires an integer step");
2160       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2161       Step = Builder.CreateTrunc(Step, TruncType);
2162     }
2163     return ScalarIV;
2164   };
2165 
2166   // Create the vector values from the scalar IV, in the absence of creating a
2167   // vector IV.
2168   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2169     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2170     for (unsigned Part = 0; Part < UF; ++Part) {
2171       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2172       Value *EntryPart =
2173           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2174                         ID.getInductionOpcode());
2175       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2176       if (Trunc)
2177         addMetadata(EntryPart, Trunc);
2178       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
2179     }
2180   };
2181 
2182   // Now do the actual transformations, and start with creating the step value.
2183   Value *Step = CreateStepValue(ID.getStep());
2184   if (VF.isZero() || VF.isScalar()) {
2185     Value *ScalarIV = CreateScalarIV(Step);
2186     CreateSplatIV(ScalarIV, Step);
2187     return;
2188   }
2189 
2190   // Determine if we want a scalar version of the induction variable. This is
2191   // true if the induction variable itself is not widened, or if it has at
2192   // least one user in the loop that is not widened.
2193   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2194   if (!NeedsScalarIV) {
2195     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2196     return;
2197   }
2198 
2199   // Try to create a new independent vector induction variable. If we can't
2200   // create the phi node, we will splat the scalar induction variable in each
2201   // loop iteration.
2202   if (!shouldScalarizeInstruction(EntryVal)) {
2203     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2204     Value *ScalarIV = CreateScalarIV(Step);
2205     // Create scalar steps that can be used by instructions we will later
2206     // scalarize. Note that the addition of the scalar steps will not increase
2207     // the number of instructions in the loop in the common case prior to
2208     // InstCombine. We will be trading one vector extract for each scalar step.
2209     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2210     return;
2211   }
2212 
2213   // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV. Except when we tail-fold, then the splat IV feeds the
2215   // predicate used by the masked loads/stores.
2216   Value *ScalarIV = CreateScalarIV(Step);
2217   if (!Cost->isScalarEpilogueAllowed())
2218     CreateSplatIV(ScalarIV, Step);
2219   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2220 }
2221 
2222 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2223                                           Instruction::BinaryOps BinOp) {
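  // Sketch of the result, for illustration (hypothetical values): with a
  // 4-element Val, StartIdx = 0 and an integer Step of 2, this returns
  //   Val + <0, 1, 2, 3> * 2 == Val + <0, 2, 4, 6>.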
2224   // Create and check the types.
2225   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2226   int VLen = ValVTy->getNumElements();
2227 
2228   Type *STy = Val->getType()->getScalarType();
2229   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2230          "Induction Step must be an integer or FP");
2231   assert(Step->getType() == STy && "Step has wrong type");
2232 
2233   SmallVector<Constant *, 8> Indices;
2234 
2235   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers starting at StartIdx.
2237     for (int i = 0; i < VLen; ++i)
2238       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2239 
2240     // Add the consecutive indices to the vector value.
2241     Constant *Cv = ConstantVector::get(Indices);
2242     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2243     Step = Builder.CreateVectorSplat(VLen, Step);
2244     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
2247     Step = Builder.CreateMul(Cv, Step);
2248     return Builder.CreateAdd(Val, Step, "induction");
2249   }
2250 
2251   // Floating point induction.
2252   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2253          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers starting at StartIdx.
2255   for (int i = 0; i < VLen; ++i)
2256     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2257 
2258   // Add the consecutive indices to the vector value.
2259   Constant *Cv = ConstantVector::get(Indices);
2260 
2261   Step = Builder.CreateVectorSplat(VLen, Step);
2262 
2263   // Floating point operations had to be 'fast' to enable the induction.
2264   FastMathFlags Flags;
2265   Flags.setFast();
2266 
2267   Value *MulOp = Builder.CreateFMul(Cv, Step);
2268   if (isa<Instruction>(MulOp))
2269     // Have to check, MulOp may be a constant
2270     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2271 
2272   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2273   if (isa<Instruction>(BOp))
2274     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2275   return BOp;
2276 }
2277 
2278 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2279                                            Instruction *EntryVal,
2280                                            const InductionDescriptor &ID) {
2281   // We shouldn't have to build scalar steps if we aren't vectorizing.
2282   assert(VF.isVector() && "VF should be greater than one");
2283   assert(!VF.isScalable() &&
2284          "the code below assumes a fixed number of elements at compile time");
  // Get the value type and ensure it and the step have the same type.
2286   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2287   assert(ScalarIVTy == Step->getType() &&
2288          "Val and Step should have the same type");
2289 
2290   // We build scalar steps for both integer and floating-point induction
2291   // variables. Here, we determine the kind of arithmetic we will perform.
2292   Instruction::BinaryOps AddOp;
2293   Instruction::BinaryOps MulOp;
2294   if (ScalarIVTy->isIntegerTy()) {
2295     AddOp = Instruction::Add;
2296     MulOp = Instruction::Mul;
2297   } else {
2298     AddOp = ID.getInductionOpcode();
2299     MulOp = Instruction::FMul;
2300   }
2301 
2302   // Determine the number of scalars we need to generate for each unroll
2303   // iteration. If EntryVal is uniform, we only need to generate the first
2304   // lane. Otherwise, we generate all VF values.
2305   unsigned Lanes =
2306       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2307           ? 1
2308           : VF.getKnownMinValue();
2309   // Compute the scalar steps and save the results in VectorLoopValueMap.
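  // For illustration only (hypothetical values): with VF = 4, UF = 2 and an
  // integer Step, lane L of part P receives ScalarIV + (4 * P + L) * Step.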
2310   for (unsigned Part = 0; Part < UF; ++Part) {
2311     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2312       auto *StartIdx = getSignedIntOrFpConstant(
2313           ScalarIVTy, VF.getKnownMinValue() * Part + Lane);
2314       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2315       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2316       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2317       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2318     }
2319   }
2320 }
2321 
2322 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2323   assert(V != Induction && "The new induction variable should not be used.");
2324   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2325   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2326 
2327   // If we have a stride that is replaced by one, do it here. Defer this for
2328   // the VPlan-native path until we start running Legal checks in that path.
2329   if (!EnableVPlanNativePath && Legal->hasStride(V))
2330     V = ConstantInt::get(V->getType(), 1);
2331 
2332   // If we have a vector mapped to this value, return it.
2333   if (VectorLoopValueMap.hasVectorValue(V, Part))
2334     return VectorLoopValueMap.getVectorValue(V, Part);
2335 
2336   // If the value has not been vectorized, check if it has been scalarized
2337   // instead. If it has been scalarized, and we actually need the value in
2338   // vector form, we will construct the vector values on demand.
2339   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2340     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2341 
2342     // If we've scalarized a value, that value should be an instruction.
2343     auto *I = cast<Instruction>(V);
2344 
2345     // If we aren't vectorizing, we can just copy the scalar map values over to
2346     // the vector map.
2347     if (VF.isScalar()) {
2348       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2349       return ScalarValue;
2350     }
2351 
2352     // Get the last scalar instruction we generated for V and Part. If the value
2353     // is known to be uniform after vectorization, this corresponds to lane zero
2354     // of the Part unroll iteration. Otherwise, the last instruction is the one
2355     // we created for the last vector lane of the Part unroll iteration.
2356     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2357     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2358                             ? 0
2359                             : VF.getKnownMinValue() - 1;
2360     auto *LastInst = cast<Instruction>(
2361         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2362 
2363     // Set the insert point after the last scalarized instruction. This ensures
2364     // the insertelement sequence will directly follow the scalar definitions.
2365     auto OldIP = Builder.saveIP();
2366     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2367     Builder.SetInsertPoint(&*NewIP);
2368 
2369     // However, if we are vectorizing, we need to construct the vector values.
2370     // If the value is known to be uniform after vectorization, we can just
2371     // broadcast the scalar value corresponding to lane zero for each unroll
2372     // iteration. Otherwise, we construct the vector values using insertelement
2373     // instructions. Since the resulting vectors are stored in
2374     // VectorLoopValueMap, we will only generate the insertelements once.
2375     Value *VectorValue = nullptr;
2376     if (Cost->isUniformAfterVectorization(I, VF)) {
2377       VectorValue = getBroadcastInstrs(ScalarValue);
2378       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2379     } else {
2380       // Initialize packing with insertelements to start from undef.
2381       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2382       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2383       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2384       for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2385         packScalarIntoVectorValue(V, {Part, Lane});
2386       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2387     }
2388     Builder.restoreIP(OldIP);
2389     return VectorValue;
2390   }
2391 
2392   // If this scalar is unknown, assume that it is a constant or that it is
2393   // loop invariant. Broadcast V and save the value for future uses.
2394   Value *B = getBroadcastInstrs(V);
2395   VectorLoopValueMap.setVectorValue(V, Part, B);
2396   return B;
2397 }
2398 
2399 Value *
2400 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2401                                             const VPIteration &Instance) {
2402   // If the value is not an instruction contained in the loop, it should
2403   // already be scalar.
2404   if (OrigLoop->isLoopInvariant(V))
2405     return V;
2406 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2410 
2411   // If the value from the original loop has not been vectorized, it is
2412   // represented by UF x VF scalar values in the new loop. Return the requested
2413   // scalar value.
2414   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2415     return VectorLoopValueMap.getScalarValue(V, Instance);
2416 
2417   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2418   // for the given unroll part. If this entry is not a vector type (i.e., the
2419   // vectorization factor is one), there is no need to generate an
2420   // extractelement instruction.
2421   auto *U = getOrCreateVectorValue(V, Instance.Part);
2422   if (!U->getType()->isVectorTy()) {
2423     assert(VF.isScalar() && "Value not scalarized has non-vector type");
2424     return U;
2425   }
2426 
2427   // Otherwise, the value from the original loop has been vectorized and is
2428   // represented by UF vector values. Extract and return the requested scalar
2429   // value from the appropriate vector lane.
2430   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2431 }
2432 
2433 void InnerLoopVectorizer::packScalarIntoVectorValue(
2434     Value *V, const VPIteration &Instance) {
2435   assert(V != Induction && "The new induction variable should not be used.");
2436   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2437   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2438 
2439   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2440   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
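  // Insert the scalar into the requested lane of the current vector value and
  // record the updated vector for this unroll part.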
2441   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2442                                             Builder.getInt32(Instance.Lane));
2443   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2444 }
2445 
2446 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2447   assert(Vec->getType()->isVectorTy() && "Invalid type");
2448   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
2449   SmallVector<int, 8> ShuffleMask;
2450   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2451     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2452 
2453   return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2454 }
2455 
2456 // Return whether we allow using masked interleave-groups (for dealing with
2457 // strided loads/stores that reside in predicated blocks, or for dealing
2458 // with gaps).
2459 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2460   // If an override option has been passed in for interleaved accesses, use it.
2461   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2462     return EnableMaskedInterleavedMemAccesses;
2463 
2464   return TTI.enableMaskedInterleavedAccessVectorization();
2465 }
2466 
2467 // Try to vectorize the interleave group that \p Instr belongs to.
2468 //
// E.g. Translate the following interleaved load group (factor = 3):
2470 //   for (i = 0; i < N; i+=3) {
2471 //     R = Pic[i];             // Member of index 0
2472 //     G = Pic[i+1];           // Member of index 1
2473 //     B = Pic[i+2];           // Member of index 2
2474 //     ... // do something to R, G, B
2475 //   }
2476 // To:
2477 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2478 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2479 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2480 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2481 //
// Or translate the following interleaved store group (factor = 3):
2483 //   for (i = 0; i < N; i+=3) {
2484 //     ... do something to R, G, B
2485 //     Pic[i]   = R;           // Member of index 0
2486 //     Pic[i+1] = G;           // Member of index 1
2487 //     Pic[i+2] = B;           // Member of index 2
2488 //   }
2489 // To:
2490 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2491 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2492 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2493 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2494 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2495 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2496     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2497     VPValue *Addr, ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask) {
2498   Instruction *Instr = Group->getInsertPos();
2499   const DataLayout &DL = Instr->getModule()->getDataLayout();
2500 
2501   // Prepare for the vector type of the interleaved load/store.
2502   Type *ScalarTy = getMemInstValueType(Instr);
2503   unsigned InterleaveFactor = Group->getFactor();
2504   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2505   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2506 
2507   // Prepare for the new pointers.
2508   SmallVector<Value *, 2> AddrParts;
2509   unsigned Index = Group->getIndex(Instr);
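  // Index is the position of the insert-position instruction within the group;
  // it is used below to adjust the pointer back to the member of index 0.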
2510 
2511   // TODO: extend the masked interleaved-group support to reversed access.
2512   assert((!BlockInMask || !Group->isReverse()) &&
2513          "Reversed masked interleave-group not supported.");
2514 
2515   // If the group is reverse, adjust the index to refer to the last vector lane
2516   // instead of the first. We adjust the index from the first vector lane,
2517   // rather than directly getting the pointer for lane VF - 1, because the
2518   // pointer operand of the interleaved access is supposed to be uniform. For
2519   // uniform instructions, we're only required to generate a value for the
2520   // first vector lane in each unroll iteration.
2521   assert(!VF.isScalable() &&
2522          "scalable vector reverse operation is not implemented");
2523   if (Group->isReverse())
2524     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2525 
2526   for (unsigned Part = 0; Part < UF; Part++) {
2527     Value *AddrPart = State.get(Addr, {Part, 0});
2528     setDebugLocFromInst(Builder, AddrPart);
2529 
    // Note that the current instruction could be any member of the group. We
    // need to adjust the address to point to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2541 
2542     bool InBounds = false;
2543     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2544       InBounds = gep->isInBounds();
2545     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2546     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2547 
2548     // Cast to the vector pointer type.
2549     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2550     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2551     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2552   }
2553 
2554   setDebugLocFromInst(Builder, Instr);
2555   Value *UndefVec = UndefValue::get(VecTy);
2556 
2557   Value *MaskForGaps = nullptr;
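  // If the group has gaps that would otherwise require a scalar epilogue, but
  // such an epilogue is not allowed, create a mask that disables the lanes
  // accessing the gaps.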
2558   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2559     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2560     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2561     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2562   }
2563 
2564   // Vectorize the interleaved load group.
2565   if (isa<LoadInst>(Instr)) {
2566     // For each unroll part, create a wide load for the group.
2567     SmallVector<Value *, 2> NewLoads;
2568     for (unsigned Part = 0; Part < UF; Part++) {
2569       Instruction *NewLoad;
2570       if (BlockInMask || MaskForGaps) {
2571         assert(useMaskedInterleavedAccesses(*TTI) &&
2572                "masked interleaved groups are not allowed.");
2573         Value *GroupMask = MaskForGaps;
2574         if (BlockInMask) {
2575           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2576           assert(!VF.isScalable() && "scalable vectors not yet supported.");
2577           Value *ShuffledMask = Builder.CreateShuffleVector(
2578               BlockInMaskPart,
2579               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2580               "interleaved.mask");
2581           GroupMask = MaskForGaps
2582                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2583                                                 MaskForGaps)
2584                           : ShuffledMask;
2585         }
2586         NewLoad =
2587             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2588                                      GroupMask, UndefVec, "wide.masked.vec");
2589       }
2590       else
2591         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2592                                             Group->getAlign(), "wide.vec");
2593       Group->addMetadata(NewLoad);
2594       NewLoads.push_back(NewLoad);
2595     }
2596 
2597     // For each member in the group, shuffle out the appropriate data from the
2598     // wide loads.
2599     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2600       Instruction *Member = Group->getMember(I);
2601 
2602       // Skip the gaps in the group.
2603       if (!Member)
2604         continue;
2605 
2606       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2607       auto StrideMask =
2608           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2609       for (unsigned Part = 0; Part < UF; Part++) {
2610         Value *StridedVec = Builder.CreateShuffleVector(
2611             NewLoads[Part], StrideMask, "strided.vec");
2612 
        // If this member has a different type, cast the result to that type.
2614         if (Member->getType() != ScalarTy) {
2615           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2616           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2617           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2618         }
2619 
2620         if (Group->isReverse())
2621           StridedVec = reverseVector(StridedVec);
2622 
2623         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2624       }
2625     }
2626     return;
2627   }
2628 
  // The sub-vector type for the current instruction.
2630   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2631   auto *SubVT = VectorType::get(ScalarTy, VF);
2632 
2633   // Vectorize the interleaved store group.
2634   for (unsigned Part = 0; Part < UF; Part++) {
2635     // Collect the stored vector from each member.
2636     SmallVector<Value *, 4> StoredVecs;
2637     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
      assert(Group->getMember(i) &&
             "Fail to get a member from an interleaved store group");
2640 
2641       Value *StoredVec = State.get(StoredValues[i], Part);
2642 
2643       if (Group->isReverse())
2644         StoredVec = reverseVector(StoredVec);
2645 
      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
2649         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2650 
2651       StoredVecs.push_back(StoredVec);
2652     }
2653 
2654     // Concatenate all vectors into a wide vector.
2655     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2656 
2657     // Interleave the elements in the wide vector.
2658     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2659     Value *IVec = Builder.CreateShuffleVector(
2660         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2661         "interleaved.vec");
2662 
2663     Instruction *NewStoreInstr;
2664     if (BlockInMask) {
2665       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2666       Value *ShuffledMask = Builder.CreateShuffleVector(
2667           BlockInMaskPart,
2668           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2669           "interleaved.mask");
2670       NewStoreInstr = Builder.CreateMaskedStore(
2671           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2672     }
2673     else
2674       NewStoreInstr =
2675           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2676 
2677     Group->addMetadata(NewStoreInstr);
2678   }
2679 }
2680 
2681 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2682     Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2683     VPValue *StoredValue, VPValue *BlockInMask) {
2684   // Attempt to issue a wide load.
2685   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2686   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2687 
2688   assert((LI || SI) && "Invalid Load/Store instruction");
2689   assert((!SI || StoredValue) && "No stored value provided for widened store");
2690   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2691 
2692   LoopVectorizationCostModel::InstWidening Decision =
2693       Cost->getWideningDecision(Instr, VF);
2694   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2695           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2696           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2697          "CM decision is not to widen the memory instruction");
2698 
2699   Type *ScalarDataTy = getMemInstValueType(Instr);
2700 
2701   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2702   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2703   const Align Alignment = getLoadStoreAlignment(Instr);
2704 
2705   // Determine if the pointer operand of the access is either consecutive or
2706   // reverse consecutive.
2707   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2708   bool ConsecutiveStride =
2709       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2710   bool CreateGatherScatter =
2711       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2712 
2713   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2714   // gather/scatter. Otherwise Decision should have been to Scalarize.
2715   assert((ConsecutiveStride || CreateGatherScatter) &&
2716          "The instruction should be scalarized");
2717   (void)ConsecutiveStride;
2718 
2719   VectorParts BlockInMaskParts(UF);
2720   bool isMaskRequired = BlockInMask;
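  // For predicated accesses, collect the mask value for each unroll part up
  // front.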
2721   if (isMaskRequired)
2722     for (unsigned Part = 0; Part < UF; ++Part)
2723       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2724 
2725   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2726     // Calculate the pointer for the specific unroll-part.
2727     GetElementPtrInst *PartPtr = nullptr;
2728 
2729     bool InBounds = false;
2730     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2731       InBounds = gep->isInBounds();
2732 
2733     if (Reverse) {
2734       // If the address is consecutive but reversed, then the
2735       // wide store needs to start at the last vector element.
2736       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2737           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2738       PartPtr->setIsInBounds(InBounds);
2739       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2740           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2741       PartPtr->setIsInBounds(InBounds);
2742       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2743         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2744     } else {
2745       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2746           ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue())));
2747       PartPtr->setIsInBounds(InBounds);
2748     }
2749 
2750     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2751     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2752   };
2753 
2754   // Handle Stores:
2755   if (SI) {
2756     setDebugLocFromInst(Builder, SI);
2757 
2758     for (unsigned Part = 0; Part < UF; ++Part) {
2759       Instruction *NewSI = nullptr;
2760       Value *StoredVal = State.get(StoredValue, Part);
2761       if (CreateGatherScatter) {
2762         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2763         Value *VectorGep = State.get(Addr, Part);
2764         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2765                                             MaskPart);
2766       } else {
2767         if (Reverse) {
2768           // If we store to reverse consecutive memory locations, then we need
2769           // to reverse the order of elements in the stored value.
2770           StoredVal = reverseVector(StoredVal);
2771           // We don't want to update the value in the map as it might be used in
2772           // another expression. So don't call resetVectorValue(StoredVal).
2773         }
2774         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2775         if (isMaskRequired)
2776           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2777                                             BlockInMaskParts[Part]);
2778         else
2779           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2780       }
2781       addMetadata(NewSI, SI);
2782     }
2783     return;
2784   }
2785 
2786   // Handle loads.
2787   assert(LI && "Must have a load instruction");
2788   setDebugLocFromInst(Builder, LI);
2789   for (unsigned Part = 0; Part < UF; ++Part) {
2790     Value *NewLI;
2791     if (CreateGatherScatter) {
2792       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2793       Value *VectorGep = State.get(Addr, Part);
2794       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2795                                          nullptr, "wide.masked.gather");
2796       addMetadata(NewLI, LI);
2797     } else {
2798       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2799       if (isMaskRequired)
2800         NewLI = Builder.CreateMaskedLoad(
2801             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2802             "wide.masked.load");
2803       else
2804         NewLI =
2805             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2806 
      // Add metadata to the load itself; the value recorded in State below is
      // the reversed shuffle when the access is reverse-consecutive.
2808       addMetadata(NewLI, LI);
2809       if (Reverse)
2810         NewLI = reverseVector(NewLI);
2811     }
2812 
2813     State.set(Def, Instr, NewLI, Part);
2814   }
2815 }
2816 
2817 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2818                                                const VPIteration &Instance,
2819                                                bool IfPredicateInstr,
2820                                                VPTransformState &State) {
2821   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2822 
2823   setDebugLocFromInst(Builder, Instr);
2824 
  // Does this instruction return a value?
2826   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2827 
2828   Instruction *Cloned = Instr->clone();
2829   if (!IsVoidRetTy)
2830     Cloned->setName(Instr->getName() + ".cloned");
2831 
  // Replace the operands of the cloned instruction with their scalar
2833   // equivalents in the new loop.
2834   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2835     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2836     auto InputInstance = Instance;
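    // Operands that are not instructions, are defined outside the loop, or are
    // uniform after vectorization only have a scalar value for lane zero.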
2837     if (!Operand || !OrigLoop->contains(Operand) ||
2838         (Cost->isUniformAfterVectorization(Operand, State.VF)))
2839       InputInstance.Lane = 0;
2840     auto *NewOp = State.get(User.getOperand(op), InputInstance);
2841     Cloned->setOperand(op, NewOp);
2842   }
2843   addNewMetadata(Cloned, Instr);
2844 
2845   // Place the cloned scalar in the new loop.
2846   Builder.Insert(Cloned);
2847 
  // TODO: Set result for VPValue of VPReplicateRecipe. This requires
  // representing scalar values in VPTransformState. For now, add the cloned
  // scalar to the scalar map entry.
2851   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2852 
  // If we just cloned a new assumption, add it to the assumption cache.
2854   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2855     if (II->getIntrinsicID() == Intrinsic::assume)
2856       AC->registerAssumption(II);
2857 
2858   // End if-block.
2859   if (IfPredicateInstr)
2860     PredicatedInstructions.push_back(Cloned);
2861 }
2862 
2863 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2864                                                       Value *End, Value *Step,
2865                                                       Instruction *DL) {
2866   BasicBlock *Header = L->getHeader();
2867   BasicBlock *Latch = L->getLoopLatch();
2868   // As we're just creating this loop, it's possible no latch exists
2869   // yet. If so, use the header as this will be a single block loop.
2870   if (!Latch)
2871     Latch = Header;
2872 
2873   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2874   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2875   setDebugLocFromInst(Builder, OldInst);
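  // Create the induction phi in the loop header; its incoming values (the
  // start value and the next-iteration value) are added below.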
2876   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2877 
2878   Builder.SetInsertPoint(Latch->getTerminator());
2879   setDebugLocFromInst(Builder, OldInst);
2880 
2881   // Create i+1 and fill the PHINode.
2882   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2883   Induction->addIncoming(Start, L->getLoopPreheader());
2884   Induction->addIncoming(Next, Latch);
2885   // Create the compare.
2886   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2887   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2888 
2889   // Now we have two terminators. Remove the old one from the block.
2890   Latch->getTerminator()->eraseFromParent();
2891 
2892   return Induction;
2893 }
2894 
2895 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2896   if (TripCount)
2897     return TripCount;
2898 
2899   assert(L && "Create Trip Count for null loop.");
2900   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2901   // Find the loop boundaries.
2902   ScalarEvolution *SE = PSE.getSE();
2903   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2904   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2905          "Invalid loop count");
2906 
2907   Type *IdxTy = Legal->getWidestInductionType();
2908   assert(IdxTy && "No type for induction");
2909 
  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way we get a backedge-taken count of the wider type is
  // if the induction variable was signed and hence will not overflow, so the
  // truncation is legal.
2915   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2916       IdxTy->getPrimitiveSizeInBits())
2917     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2918   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2919 
2920   // Get the total trip count from the count by adding 1.
2921   const SCEV *ExitCount = SE->getAddExpr(
2922       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2923 
2924   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2925 
2926   // Expand the trip count and place the new instructions in the preheader.
2927   // Notice that the pre-header does not change, only the loop body.
2928   SCEVExpander Exp(*SE, DL, "induction");
2929 
2930   // Count holds the overall loop count (N).
2931   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2932                                 L->getLoopPreheader()->getTerminator());
2933 
2934   if (TripCount->getType()->isPointerTy())
2935     TripCount =
2936         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2937                                     L->getLoopPreheader()->getTerminator());
2938 
2939   return TripCount;
2940 }
2941 
2942 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2943   if (VectorTripCount)
2944     return VectorTripCount;
2945 
2946   Value *TC = getOrCreateTripCount(L);
2947   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2948 
2949   Type *Ty = TC->getType();
2950   // This is where we can make the step a runtime constant.
2951   assert(!VF.isScalable() && "scalable vectorization is not supported yet");
2952   Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
2953 
2954   // If the tail is to be folded by masking, round the number of iterations N
2955   // up to a multiple of Step instead of rounding down. This is done by first
2956   // adding Step-1 and then rounding down. Note that it's ok if this addition
2957   // overflows: the vector induction variable will eventually wrap to zero given
2958   // that it starts at zero and its Step is a power of two; the loop will then
2959   // exit, with the last early-exit vector comparison also producing all-true.
2960   if (Cost->foldTailByMasking()) {
2961     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2962            "VF*UF must be a power of 2 when folding tail by masking");
2963     TC = Builder.CreateAdd(
2964         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
2965   }
2966 
2967   // Now we need to generate the expression for the part of the loop that the
2968   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2969   // iterations are not required for correctness, or N - Step, otherwise. Step
2970   // is equal to the vectorization factor (number of SIMD elements) times the
2971   // unroll factor (number of SIMD instructions).
2972   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2973 
2974   // If there is a non-reversed interleaved group that may speculatively access
2975   // memory out-of-bounds, we need to ensure that there will be at least one
2976   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2977   // the trip count, we set the remainder to be equal to the step. If the step
2978   // does not evenly divide the trip count, no adjustment is necessary since
2979   // there will already be scalar iterations. Note that the minimum iterations
2980   // check ensures that N >= Step.
2981   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
2982     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2983     R = Builder.CreateSelect(IsZero, Step, R);
2984   }
2985 
2986   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2987 
2988   return VectorTripCount;
2989 }
2990 
2991 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2992                                                    const DataLayout &DL) {
2993   // Verify that V is a vector type with same number of elements as DstVTy.
2994   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2995   unsigned VF = DstFVTy->getNumElements();
2996   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2998   Type *SrcElemTy = SrcVecTy->getElementType();
2999   Type *DstElemTy = DstFVTy->getElementType();
3000   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3001          "Vector elements must have same size");
3002 
3003   // Do a direct cast if element types are castable.
3004   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3005     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3006   }
  // V cannot be cast directly to the desired vector type.
3008   // May happen when V is a floating point vector but DstVTy is a vector of
3009   // pointers or vice-versa. Handle this using a two-step bitcast using an
3010   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3011   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3012          "Only one type should be a pointer type");
3013   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3014          "Only one type should be a floating point type");
3015   Type *IntTy =
3016       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3017   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3018   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3019   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3020 }
3021 
3022 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3023                                                          BasicBlock *Bypass) {
3024   Value *Count = getOrCreateTripCount(L);
3025   // Reuse existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
3027   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3028   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3029 
3030   // Generate code to check if the loop's trip count is less than VF * UF, or
3031   // equal to it in case a scalar epilogue is required; this implies that the
3032   // vector trip count is zero. This check also covers the case where adding one
3033   // to the backedge-taken count overflowed leading to an incorrect trip count
3034   // of zero. In this case we will also jump to the scalar loop.
3035   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3036                                           : ICmpInst::ICMP_ULT;
3037 
3038   // If tail is to be folded, vector loop takes care of all iterations.
3039   Value *CheckMinIters = Builder.getFalse();
3040   if (!Cost->foldTailByMasking()) {
3041     assert(!VF.isScalable() && "scalable vectors not yet supported.");
3042     CheckMinIters = Builder.CreateICmp(
3043         P, Count,
3044         ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF),
3045         "min.iters.check");
3046   }
3047   // Create new preheader for vector loop.
3048   LoopVectorPreHeader =
3049       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3050                  "vector.ph");
3051 
3052   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3053                                DT->getNode(Bypass)->getIDom()) &&
3054          "TC check is expected to dominate Bypass");
3055 
3056   // Update dominator for Bypass & LoopExit.
3057   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3058   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3059 
3060   ReplaceInstWithInst(
3061       TCCheckBlock->getTerminator(),
3062       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3063   LoopBypassBlocks.push_back(TCCheckBlock);
3064 }
3065 
3066 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3067   // Reuse existing vector loop preheader for SCEV checks.
  // Note that a new preheader block is generated for the vector loop.
3069   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
3070 
  // Generate the code to check the SCEV assumptions that we made.
3072   // We want the new basic block to start at the first instruction in a
3073   // sequence of instructions that form a check.
3074   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
3075                    "scev.check");
3076   Value *SCEVCheck = Exp.expandCodeForPredicate(
3077       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
3078 
3079   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
3080     if (C->isZero())
3081       return;
3082 
3083   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3084            (OptForSizeBasedOnProfile &&
3085             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3086          "Cannot SCEV check stride or overflow when optimizing for size");
3087 
3088   SCEVCheckBlock->setName("vector.scevcheck");
3089   // Create new preheader for vector loop.
3090   LoopVectorPreHeader =
3091       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
3092                  nullptr, "vector.ph");
3093 
  // Update dominator only if this is the first RT check.
3095   if (LoopBypassBlocks.empty()) {
3096     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3097     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3098   }
3099 
3100   ReplaceInstWithInst(
3101       SCEVCheckBlock->getTerminator(),
3102       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
3103   LoopBypassBlocks.push_back(SCEVCheckBlock);
3104   AddedSafetyChecks = true;
3105 }
3106 
3107 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3108   // VPlan-native path does not do any analysis for runtime checks currently.
3109   if (EnableVPlanNativePath)
3110     return;
3111 
3112   // Reuse existing vector loop preheader for runtime memory checks.
  // Note that a new preheader block is generated for the vector loop.
3114   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
3115 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
3119   auto *LAI = Legal->getLAI();
3120   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
3121   if (!RtPtrChecking.Need)
3122     return;
3123 
3124   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3125     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3126            "Cannot emit memory checks when optimizing for size, unless forced "
3127            "to vectorize.");
3128     ORE->emit([&]() {
3129       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3130                                         L->getStartLoc(), L->getHeader())
3131              << "Code-size may be reduced by not forcing "
3132                 "vectorization, or by source-code modifications "
3133                 "eliminating the need for runtime checks "
3134                 "(e.g., adding 'restrict').";
3135     });
3136   }
3137 
3138   MemCheckBlock->setName("vector.memcheck");
3139   // Create new preheader for vector loop.
3140   LoopVectorPreHeader =
3141       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
3142                  "vector.ph");
3143 
  // Update dominator only if this is the first RT check; note that
  // MemCheckBlock has not been added to LoopBypassBlocks yet at this point.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, MemCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
  }

  // Create the conditional branch with a placeholder (always-true) condition;
  // the real condition is set below, once the runtime checks have been
  // generated.
  auto *CondBranch = cast<BranchInst>(
      Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
  ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
  LoopBypassBlocks.push_back(MemCheckBlock);
  AddedSafetyChecks = true;
3155 
3156   Instruction *FirstCheckInst;
3157   Instruction *MemRuntimeCheck;
3158   std::tie(FirstCheckInst, MemRuntimeCheck) =
3159       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
3160                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
3161   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
3162                             "claimed checks are required");
3163   CondBranch->setCondition(MemRuntimeCheck);
3164 
3165   // We currently don't use LoopVersioning for the actual loop cloning but we
3166   // still use it to add the noalias metadata.
3167   LVer = std::make_unique<LoopVersioning>(
3168       *Legal->getLAI(),
3169       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3170       DT, PSE.getSE());
3171   LVer->prepareNoAliasMetadata();
3172 }
3173 
3174 Value *InnerLoopVectorizer::emitTransformedIndex(
3175     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3176     const InductionDescriptor &ID) const {
3177 
3178   SCEVExpander Exp(*SE, DL, "induction");
3179   auto Step = ID.getStep();
3180   auto StartValue = ID.getStartValue();
3181   assert(Index->getType() == Step->getType() &&
3182          "Index type does not match StepValue type");
3183 
3184   // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle some
  // trivial cases only.
3190   auto CreateAdd = [&B](Value *X, Value *Y) {
3191     assert(X->getType() == Y->getType() && "Types don't match!");
3192     if (auto *CX = dyn_cast<ConstantInt>(X))
3193       if (CX->isZero())
3194         return Y;
3195     if (auto *CY = dyn_cast<ConstantInt>(Y))
3196       if (CY->isZero())
3197         return X;
3198     return B.CreateAdd(X, Y);
3199   };
3200 
3201   auto CreateMul = [&B](Value *X, Value *Y) {
3202     assert(X->getType() == Y->getType() && "Types don't match!");
3203     if (auto *CX = dyn_cast<ConstantInt>(X))
3204       if (CX->isOne())
3205         return Y;
3206     if (auto *CY = dyn_cast<ConstantInt>(Y))
3207       if (CY->isOne())
3208         return X;
3209     return B.CreateMul(X, Y);
3210   };
3211 
3212   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3213   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3214   // the DomTree is not kept up-to-date for additional blocks generated in the
3215   // vector loop. By using the header as insertion point, we guarantee that the
3216   // expanded instructions dominate all their uses.
3217   auto GetInsertPoint = [this, &B]() {
3218     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3219     if (InsertBB != LoopVectorBody &&
3220         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3221       return LoopVectorBody->getTerminator();
3222     return &*B.GetInsertPoint();
3223   };
3224   switch (ID.getKind()) {
3225   case InductionDescriptor::IK_IntInduction: {
3226     assert(Index->getType() == StartValue->getType() &&
3227            "Index type does not match StartValue type");
3228     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3229       return B.CreateSub(StartValue, Index);
3230     auto *Offset = CreateMul(
3231         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3232     return CreateAdd(StartValue, Offset);
3233   }
3234   case InductionDescriptor::IK_PtrInduction: {
3235     assert(isa<SCEVConstant>(Step) &&
3236            "Expected constant step for pointer induction");
3237     return B.CreateGEP(
3238         StartValue->getType()->getPointerElementType(), StartValue,
3239         CreateMul(Index,
3240                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3241   }
3242   case InductionDescriptor::IK_FpInduction: {
3243     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3244     auto InductionBinOp = ID.getInductionBinOp();
3245     assert(InductionBinOp &&
3246            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3247             InductionBinOp->getOpcode() == Instruction::FSub) &&
3248            "Original bin op should be defined for FP induction");
3249 
3250     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3251 
3252     // Floating point operations had to be 'fast' to enable the induction.
3253     FastMathFlags Flags;
3254     Flags.setFast();
3255 
3256     Value *MulExp = B.CreateFMul(StepValue, Index);
3257     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
3259       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3260 
3261     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3262                                "induction");
3263     if (isa<Instruction>(BOp))
3264       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3265 
3266     return BOp;
3267   }
3268   case InductionDescriptor::IK_NoInduction:
3269     return nullptr;
3270   }
3271   llvm_unreachable("invalid enum");
3272 }
3273 
3274 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3275   LoopScalarBody = OrigLoop->getHeader();
3276   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3277   LoopExitBlock = OrigLoop->getExitBlock();
3278   assert(LoopExitBlock && "Must have an exit block");
3279   assert(LoopVectorPreHeader && "Invalid loop structure");
3280 
3281   LoopMiddleBlock =
3282       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3283                  LI, nullptr, Twine(Prefix) + "middle.block");
3284   LoopScalarPreHeader =
3285       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3286                  nullptr, Twine(Prefix) + "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo since LoopVectorBody
  // should belong to a different loop than LoopVectorPreHeader. LoopVectorBody
  // is explicitly added to the correct place a few lines later.
3290   LoopVectorBody =
3291       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3292                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3293 
3294   // Update dominator for loop exit.
3295   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3296 
3297   // Create and register the new vector loop.
3298   Loop *Lp = LI->AllocateLoop();
3299   Loop *ParentLoop = OrigLoop->getParentLoop();
3300 
3301   // Insert the new loop into the loop nest and register the new basic blocks
3302   // before calling any utilities such as SCEV that require valid LoopInfo.
3303   if (ParentLoop) {
3304     ParentLoop->addChildLoop(Lp);
3305   } else {
3306     LI->addTopLevelLoop(Lp);
3307   }
3308   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3309   return Lp;
3310 }
3311 
3312 void InnerLoopVectorizer::createInductionResumeValues(
3313     Loop *L, Value *VectorTripCount,
3314     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3315   assert(VectorTripCount && L && "Expected valid arguments");
3316   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3317           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3318          "Inconsistent information about additional bypass.");
3319   // We are going to resume the execution of the scalar loop.
3320   // Go over all of the induction variables that we found and fix the
3321   // PHIs that are left in the scalar version of the loop.
3322   // The starting values of PHI nodes depend on the counter of the last
3323   // iteration in the vectorized loop.
3324   // If we come from a bypass edge then we need to start from the original
3325   // start value.
3326   for (auto &InductionEntry : Legal->getInductionVars()) {
3327     PHINode *OrigPhi = InductionEntry.first;
3328     InductionDescriptor II = InductionEntry.second;
3329 
    // Create phi nodes to merge from the backedge-taken check block.
3331     PHINode *BCResumeVal =
3332         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3333                         LoopScalarPreHeader->getTerminator());
3334     // Copy original phi DL over to the new one.
3335     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3336     Value *&EndValue = IVEndValues[OrigPhi];
3337     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3338     if (OrigPhi == OldInduction) {
3339       // We know what the end value is.
3340       EndValue = VectorTripCount;
3341     } else {
3342       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3343       Type *StepType = II.getStep()->getType();
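      // Cast the vector trip count to the step type so it can be used as the
      // index when computing the transformed end value.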
3344       Instruction::CastOps CastOp =
3345           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3346       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3347       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3348       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3349       EndValue->setName("ind.end");
3350 
3351       // Compute the end value for the additional bypass (if applicable).
3352       if (AdditionalBypass.first) {
3353         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3354         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3355                                          StepType, true);
3356         CRD =
3357             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3358         EndValueFromAdditionalBypass =
3359             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3360         EndValueFromAdditionalBypass->setName("ind.end");
3361       }
3362     }
3363     // The new PHI merges the original incoming value, in case of a bypass,
3364     // or the value at the end of the vectorized loop.
3365     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3366 
3367     // Fix the scalar body counter (PHI node).
3368     // The old induction's phi node in the scalar body needs the truncated
3369     // value.
3370     for (BasicBlock *BB : LoopBypassBlocks)
3371       BCResumeVal->addIncoming(II.getStartValue(), BB);
3372 
3373     if (AdditionalBypass.first)
3374       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3375                                             EndValueFromAdditionalBypass);
3376 
3377     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3378   }
3379 }
3380 
3381 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3382                                                       MDNode *OrigLoopID) {
3383   assert(L && "Expected valid loop.");
3384 
3385   // The trip counts should be cached by now.
3386   Value *Count = getOrCreateTripCount(L);
3387   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3388 
3389   // We need the OrigLoop (scalar loop part) latch terminator to help
3390   // produce correct debug info for the middle block BB instructions.
3391   // The legality check stage guarantees that the loop will have a single
3392   // latch.
3393   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3394          "Scalar loop latch terminator isn't a branch");
3395   BranchInst *ScalarLatchBr =
3396       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3397 
3398   // Add a check in the middle block to see if we have completed
3399   // all of the iterations in the first vector loop.
3400   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3401   // If tail is to be folded, we know we don't need to run the remainder.
3402   Value *CmpN = Builder.getTrue();
3403   if (!Cost->foldTailByMasking()) {
3404     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3405                            VectorTripCount, "cmp.n",
3406                            LoopMiddleBlock->getTerminator());
3407 
3408     // Here we use the same DebugLoc as the scalar loop latch branch instead
3409     // of the corresponding compare because they may have ended up with
3410     // different line numbers and we want to avoid awkward line stepping while
    // debugging, e.g. if the compare has a line number inside the loop.
3412     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3413   }
3414 
3415   BranchInst *BrInst =
3416       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3417   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3418   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3419 
3420   // Get ready to start creating new instructions into the vectorized body.
3421   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3422          "Inconsistent vector loop preheader");
3423   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3424 
3425   Optional<MDNode *> VectorizedLoopID =
3426       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3427                                       LLVMLoopVectorizeFollowupVectorized});
3428   if (VectorizedLoopID.hasValue()) {
3429     L->setLoopID(VectorizedLoopID.getValue());
3430 
3431     // Do not setAlreadyVectorized if loop attributes have been defined
3432     // explicitly.
3433     return LoopVectorPreHeader;
3434   }
3435 
3436   // Keep all loop hints from the original loop on the vector loop (we'll
3437   // replace the vectorizer-specific hints below).
3438   if (MDNode *LID = OrigLoop->getLoopID())
3439     L->setLoopID(LID);
3440 
3441   LoopVectorizeHints Hints(L, true, *ORE);
3442   Hints.setAlreadyVectorized();
3443 
3444 #ifdef EXPENSIVE_CHECKS
3445   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3446   LI->verify(*DT);
3447 #endif
3448 
3449   return LoopVectorPreHeader;
3450 }
3451 
3452 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3453   /*
3454    In this function we generate a new loop. The new loop will contain
3455    the vectorized instructions while the old loop will continue to run the
3456    scalar remainder.
3457 
3458        [ ] <-- loop iteration number check.
3459     /   |
3460    /    v
3461   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3462   |  /  |
3463   | /   v
3464   ||   [ ]     <-- vector pre header.
3465   |/    |
3466   |     v
3467   |    [  ] \
3468   |    [  ]_|   <-- vector loop.
3469   |     |
3470   |     v
3471   |   -[ ]   <--- middle-block.
3472   |  /  |
3473   | /   v
3474   -|- >[ ]     <--- new preheader.
3475    |    |
3476    |    v
3477    |   [ ] \
3478    |   [ ]_|   <-- old scalar loop to handle remainder.
3479     \   |
3480      \  v
3481       >[ ]     <-- exit block.
3482    ...
3483    */
3484 
3485   // Get the metadata of the original loop before it gets modified.
3486   MDNode *OrigLoopID = OrigLoop->getLoopID();
3487 
3488   // Create an empty vector loop, and prepare basic blocks for the runtime
3489   // checks.
3490   Loop *Lp = createVectorLoopSkeleton("");
3491 
3492   // Now, compare the new count to zero. If it is zero skip the vector loop and
3493   // jump to the scalar loop. This check also covers the case where the
3494   // backedge-taken count is uint##_max: adding one to it will overflow leading
3495   // to an incorrect trip count of zero. In this (rare) case we will also jump
3496   // to the scalar loop.
3497   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3498 
3499   // Generate the code to check any assumptions that we've made for SCEV
3500   // expressions.
3501   emitSCEVChecks(Lp, LoopScalarPreHeader);
3502 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
3506   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3507 
3508   // Some loops have a single integer induction variable, while other loops
3509   // don't. One example is c++ iterators that often have multiple pointer
3510   // induction variables. In the code below we also support a case where we
3511   // don't have a single induction variable.
3512   //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
3515   //   - is an integer
3516   //   - counts from zero, stepping by one
3517   //   - is the size of the widest induction variable type
3518   // then we create a new one.
3519   OldInduction = Legal->getPrimaryInduction();
3520   Type *IdxTy = Legal->getWidestInductionType();
3521   Value *StartIdx = ConstantInt::get(IdxTy, 0);
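  // The vector loop's canonical induction always starts at zero; the resume
  // values for the scalar loop are computed separately below.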
3522   // The loop step is equal to the vectorization factor (num of SIMD elements)
3523   // times the unroll factor (num of SIMD instructions).
3524   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3525   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
3526   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3527   Induction =
3528       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3529                               getDebugLocFromInstOrOperands(OldInduction));
3530 
3531   // Emit phis for the new starting index of the scalar loop.
3532   createInductionResumeValues(Lp, CountRoundDown);
3533 
3534   return completeLoopSkeleton(Lp, OrigLoopID);
3535 }
3536 
3537 // Fix up external users of the induction variable. At this point, we are
3538 // in LCSSA form, with all external PHIs that use the IV having one input value,
3539 // coming from the remainder loop. We need those PHIs to also have a correct
3540 // value for the IV when arriving directly from the middle block.
3541 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3542                                        const InductionDescriptor &II,
3543                                        Value *CountRoundDown, Value *EndValue,
3544                                        BasicBlock *MiddleBlock) {
3545   // There are two kinds of external IV usages - those that use the value
3546   // computed in the last iteration (the PHI) and those that use the penultimate
3547   // value (the value that feeds into the phi from the loop latch).
3548   // We allow both, but they, obviously, have different values.
3549 
3550   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3551 
3552   DenseMap<Value *, Value *> MissingVals;
3553 
3554   // An external user of the last iteration's value should see the value that
3555   // the remainder loop uses to initialize its own IV.
3556   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3557   for (User *U : PostInc->users()) {
3558     Instruction *UI = cast<Instruction>(U);
3559     if (!OrigLoop->contains(UI)) {
3560       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3561       MissingVals[UI] = EndValue;
3562     }
3563   }
3564 
3565   // An external user of the penultimate value needs to see EndValue - Step.
3566   // The simplest way to get this is to recompute it from the constituent SCEVs,
3567   // that is Start + (Step * (CRD - 1)).
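  // For instance, for an IV with start 0 and step 2, and a vector trip count
  // (CRD) of 8, the escaping penultimate value is 0 + 2 * (8 - 1) = 14.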
3568   for (User *U : OrigPhi->users()) {
3569     auto *UI = cast<Instruction>(U);
3570     if (!OrigLoop->contains(UI)) {
3571       const DataLayout &DL =
3572           OrigLoop->getHeader()->getModule()->getDataLayout();
3573       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3574 
3575       IRBuilder<> B(MiddleBlock->getTerminator());
3576       Value *CountMinusOne = B.CreateSub(
3577           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3578       Value *CMO =
3579           !II.getStep()->getType()->isIntegerTy()
3580               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3581                              II.getStep()->getType())
3582               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3583       CMO->setName("cast.cmo");
3584       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3585       Escape->setName("ind.escape");
3586       MissingVals[UI] = Escape;
3587     }
3588   }
3589 
3590   for (auto &I : MissingVals) {
3591     PHINode *PHI = cast<PHINode>(I.first);
3592     // One corner case we have to handle is two IVs "chasing" each other,
3593     // that is %IV2 = phi [...], [ %IV1, %latch ]
3594     // In this case, if IV1 has an external use, we need to avoid adding both
3595     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3596     // don't already have an incoming value for the middle block.
3597     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3598       PHI->addIncoming(I.second, MiddleBlock);
3599   }
3600 }
3601 
3602 namespace {
3603 
3604 struct CSEDenseMapInfo {
3605   static bool canHandle(const Instruction *I) {
3606     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3607            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3608   }
3609 
3610   static inline Instruction *getEmptyKey() {
3611     return DenseMapInfo<Instruction *>::getEmptyKey();
3612   }
3613 
3614   static inline Instruction *getTombstoneKey() {
3615     return DenseMapInfo<Instruction *>::getTombstoneKey();
3616   }
3617 
3618   static unsigned getHashValue(const Instruction *I) {
3619     assert(canHandle(I) && "Unknown instruction!");
3620     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3621                                                            I->value_op_end()));
3622   }
3623 
3624   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3625     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3626         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3627       return LHS == RHS;
3628     return LHS->isIdenticalTo(RHS);
3629   }
3630 };
3631 
3632 } // end anonymous namespace
3633 
3634 /// Perform CSE of induction variable instructions.
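/// This removes, for example, identical getelementptr or extractelement
/// instructions that were emitted while widening the induction variables for
/// the different unroll parts.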
3635 static void cse(BasicBlock *BB) {
3636   // Perform simple cse.
3637   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3638   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3639     Instruction *In = &*I++;
3640 
3641     if (!CSEDenseMapInfo::canHandle(In))
3642       continue;
3643 
3644     // Check if we can replace this instruction with any of the
3645     // visited instructions.
3646     if (Instruction *V = CSEMap.lookup(In)) {
3647       In->replaceAllUsesWith(V);
3648       In->eraseFromParent();
3649       continue;
3650     }
3651 
3652     CSEMap[In] = In;
3653   }
3654 }
3655 
3656 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3657                                                        ElementCount VF,
3658                                                        bool &NeedToScalarize) {
3659   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3660   Function *F = CI->getCalledFunction();
3661   Type *ScalarRetTy = CI->getType();
3662   SmallVector<Type *, 4> Tys, ScalarTys;
3663   for (auto &ArgOp : CI->arg_operands())
3664     ScalarTys.push_back(ArgOp->getType());
3665 
3666   // Estimate the cost of a scalarized vector call. The source operands are
3667   // assumed to be vectors, so we need to extract individual elements from
3668   // them, execute VF scalar calls, and then gather the results into the
3669   // vector return value.
3670   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3671                                                  TTI::TCK_RecipThroughput);
3672   if (VF.isScalar())
3673     return ScalarCallCost;
3674 
3675   // Compute corresponding vector type for return value and arguments.
3676   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3677   for (Type *ScalarTy : ScalarTys)
3678     Tys.push_back(ToVectorTy(ScalarTy, VF));
3679 
3680   // Compute costs of unpacking argument values for the scalar calls and
3681   // packing the return values to a vector.
3682   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3683 
3684   unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
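  // For example, with VF = 4, a scalar call cost of 10 and a scalarization
  // overhead of 6, the scalarized cost is 4 * 10 + 6 = 46.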
3685 
3686   // If we can't emit a vector call for this function, then the currently found
3687   // cost is the cost we need to return.
3688   NeedToScalarize = true;
3689   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3690   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3691 
3692   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3693     return Cost;
3694 
3695   // If the corresponding vector cost is cheaper, return its cost.
3696   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3697                                                  TTI::TCK_RecipThroughput);
3698   if (VectorCallCost < Cost) {
3699     NeedToScalarize = false;
3700     return VectorCallCost;
3701   }
3702   return Cost;
3703 }
3704 
3705 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3706                                                             ElementCount VF) {
3707   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3708   assert(ID && "Expected intrinsic call!");
3709 
3710   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3711   return TTI.getIntrinsicInstrCost(CostAttrs,
3712                                    TargetTransformInfo::TCK_RecipThroughput);
3713 }
3714 
3715 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3716   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3717   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3718   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3719 }
3720 
3721 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3722   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3723   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3724   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3725 }
3726 
3727 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3728   // For every instruction `I` in MinBWs, truncate the operands, create a
3729   // truncated version of `I` and re-extend its result. InstCombine runs
3730   // later and will remove any ext/trunc pairs.
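  //
  // For example, an i32 add whose result is known to need only 8 bits is
  // rewritten by truncating both operands to <VF x i8>, performing the add in
  // i8, and zero-extending the result back to <VF x i32>.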
3731   SmallPtrSet<Value *, 4> Erased;
3732   for (const auto &KV : Cost->getMinimalBitwidths()) {
3733     // If the value wasn't vectorized, we must maintain the original scalar
3734     // type. The absence of the value from VectorLoopValueMap indicates that it
3735     // wasn't vectorized.
3736     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3737       continue;
3738     for (unsigned Part = 0; Part < UF; ++Part) {
3739       Value *I = getOrCreateVectorValue(KV.first, Part);
3740       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3741         continue;
3742       Type *OriginalTy = I->getType();
3743       Type *ScalarTruncatedTy =
3744           IntegerType::get(OriginalTy->getContext(), KV.second);
3745       auto *TruncatedTy = FixedVectorType::get(
3746           ScalarTruncatedTy,
3747           cast<FixedVectorType>(OriginalTy)->getNumElements());
3748       if (TruncatedTy == OriginalTy)
3749         continue;
3750 
3751       IRBuilder<> B(cast<Instruction>(I));
3752       auto ShrinkOperand = [&](Value *V) -> Value * {
3753         if (auto *ZI = dyn_cast<ZExtInst>(V))
3754           if (ZI->getSrcTy() == TruncatedTy)
3755             return ZI->getOperand(0);
3756         return B.CreateZExtOrTrunc(V, TruncatedTy);
3757       };
3758 
3759       // The actual instruction modification depends on the instruction type,
3760       // unfortunately.
3761       Value *NewI = nullptr;
3762       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3763         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3764                              ShrinkOperand(BO->getOperand(1)));
3765 
3766         // Any wrapping introduced by shrinking this operation shouldn't be
3767         // considered undefined behavior. So, we can't unconditionally copy
3768         // arithmetic wrapping flags to NewI.
3769         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3770       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3771         NewI =
3772             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3773                          ShrinkOperand(CI->getOperand(1)));
3774       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3775         NewI = B.CreateSelect(SI->getCondition(),
3776                               ShrinkOperand(SI->getTrueValue()),
3777                               ShrinkOperand(SI->getFalseValue()));
3778       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3779         switch (CI->getOpcode()) {
3780         default:
3781           llvm_unreachable("Unhandled cast!");
3782         case Instruction::Trunc:
3783           NewI = ShrinkOperand(CI->getOperand(0));
3784           break;
3785         case Instruction::SExt:
3786           NewI = B.CreateSExtOrTrunc(
3787               CI->getOperand(0),
3788               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3789           break;
3790         case Instruction::ZExt:
3791           NewI = B.CreateZExtOrTrunc(
3792               CI->getOperand(0),
3793               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3794           break;
3795         }
3796       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3797         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3798                              ->getNumElements();
3799         auto *O0 = B.CreateZExtOrTrunc(
3800             SI->getOperand(0),
3801             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3802         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3803                              ->getNumElements();
3804         auto *O1 = B.CreateZExtOrTrunc(
3805             SI->getOperand(1),
3806             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3807 
3808         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3809       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3810         // Don't do anything with the operands, just extend the result.
3811         continue;
3812       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3813         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3814                             ->getNumElements();
3815         auto *O0 = B.CreateZExtOrTrunc(
3816             IE->getOperand(0),
3817             FixedVectorType::get(ScalarTruncatedTy, Elements));
3818         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3819         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3820       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3821         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3822                             ->getNumElements();
3823         auto *O0 = B.CreateZExtOrTrunc(
3824             EE->getOperand(0),
3825             FixedVectorType::get(ScalarTruncatedTy, Elements));
3826         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3827       } else {
3828         // If we don't know what to do, be conservative and don't do anything.
3829         continue;
3830       }
3831 
3832       // Lastly, extend the result.
3833       NewI->takeName(cast<Instruction>(I));
3834       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3835       I->replaceAllUsesWith(Res);
3836       cast<Instruction>(I)->eraseFromParent();
3837       Erased.insert(I);
3838       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3839     }
3840   }
3841 
3842   // We'll have created a bunch of ZExts that are now dead. Clean them up.
3843   for (const auto &KV : Cost->getMinimalBitwidths()) {
3844     // If the value wasn't vectorized, we must maintain the original scalar
3845     // type. The absence of the value from VectorLoopValueMap indicates that it
3846     // wasn't vectorized.
3847     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3848       continue;
3849     for (unsigned Part = 0; Part < UF; ++Part) {
3850       Value *I = getOrCreateVectorValue(KV.first, Part);
3851       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3852       if (Inst && Inst->use_empty()) {
3853         Value *NewI = Inst->getOperand(0);
3854         Inst->eraseFromParent();
3855         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3856       }
3857     }
3858   }
3859 }
3860 
3861 void InnerLoopVectorizer::fixVectorizedLoop() {
3862   // Insert truncates and extends for any truncated instructions as hints to
3863   // InstCombine.
3864   if (VF.isVector())
3865     truncateToMinimalBitwidths();
3866 
3867   // Fix widened non-induction PHIs by setting up the PHI operands.
3868   if (OrigPHIsToFix.size()) {
3869     assert(EnableVPlanNativePath &&
3870            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3871     fixNonInductionPHIs();
3872   }
3873 
3874   // At this point every instruction in the original loop is widened to a
3875   // vector form. Now we need to fix the recurrences in the loop. These PHI
3876   // nodes are currently empty because we did not want to introduce cycles.
3877   // This is the second stage of vectorizing recurrences.
3878   fixCrossIterationPHIs();
3879 
3880   // Forget the original basic block.
3881   PSE.getSE()->forgetLoop(OrigLoop);
3882 
3883   // Fix-up external users of the induction variables.
3884   for (auto &Entry : Legal->getInductionVars())
3885     fixupIVUsers(Entry.first, Entry.second,
3886                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3887                  IVEndValues[Entry.first], LoopMiddleBlock);
3888 
3889   fixLCSSAPHIs();
3890   for (Instruction *PI : PredicatedInstructions)
3891     sinkScalarOperands(&*PI);
3892 
3893   // Remove redundant induction instructions.
3894   cse(LoopVectorBody);
3895 
3896   // Set/update profile weights for the vector and remainder loops as original
3897   // loop iterations are now distributed among them. Note that original loop
3898   // represented by LoopScalarBody becomes remainder loop after vectorization.
3899   //
3900   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3901   // end up with a slightly less precise result, but that should be OK since
3902   // profile data is not inherently precise anyway. Note also that a possible
3903   // bypass of the vector code caused by legality checks is ignored,
3904   // optimistically assigning all the weight to the vector loop.
3905   assert(!VF.isScalable() &&
3906          "cannot use scalable ElementCount to determine unroll factor");
3907   setProfileInfoAfterUnrolling(
3908       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3909       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3910 }
3911 
3912 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3913   // In order to support recurrences we need to be able to vectorize Phi nodes.
3914   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3915   // stage #2: We now need to fix the recurrences by adding incoming edges to
3916   // the currently empty PHI nodes. At this point every instruction in the
3917   // original loop is widened to a vector form so we can use them to construct
3918   // the incoming edges.
3919   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3920     // Handle first-order recurrences and reductions that need to be fixed.
3921     if (Legal->isFirstOrderRecurrence(&Phi))
3922       fixFirstOrderRecurrence(&Phi);
3923     else if (Legal->isReductionVariable(&Phi))
3924       fixReduction(&Phi);
3925   }
3926 }
3927 
3928 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3929   // This is the second phase of vectorizing first-order recurrences. An
3930   // overview of the transformation is described below. Suppose we have the
3931   // following loop.
3932   //
3933   //   for (int i = 0; i < n; ++i)
3934   //     b[i] = a[i] - a[i - 1];
3935   //
3936   // There is a first-order recurrence on "a". For this loop, the shorthand
3937   // scalar IR looks like:
3938   //
3939   //   scalar.ph:
3940   //     s_init = a[-1]
3941   //     br scalar.body
3942   //
3943   //   scalar.body:
3944   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3945   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3946   //     s2 = a[i]
3947   //     b[i] = s2 - s1
3948   //     br cond, scalar.body, ...
3949   //
3950   // In this example, s1 is a recurrence because its value depends on the
3951   // previous iteration. In the first phase of vectorization, we created a
3952   // temporary value for s1. We now complete the vectorization and produce the
3953   // shorthand vector IR shown below (for VF = 4, UF = 1).
3954   //
3955   //   vector.ph:
3956   //     v_init = vector(..., ..., ..., a[-1])
3957   //     br vector.body
3958   //
3959   //   vector.body
3960   //     i = phi [0, vector.ph], [i+4, vector.body]
3961   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3962   //     v2 = a[i, i+1, i+2, i+3];
3963   //     v3 = vector(v1(3), v2(0, 1, 2))
3964   //     b[i, i+1, i+2, i+3] = v2 - v3
3965   //     br cond, vector.body, middle.block
3966   //
3967   //   middle.block:
3968   //     x = v2(3)
3969   //     br scalar.ph
3970   //
3971   //   scalar.ph:
3972   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3973   //     br scalar.body
3974   //
3975   // After the vector loop completes execution, we extract the next value of
3976   // the recurrence (x) to use as the initial value in the scalar loop.
3977 
3978   // Get the original loop preheader and single loop latch.
3979   auto *Preheader = OrigLoop->getLoopPreheader();
3980   auto *Latch = OrigLoop->getLoopLatch();
3981 
3982   // Get the initial and previous values of the scalar recurrence.
3983   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3984   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3985 
3986   // Create a vector from the initial value.
3987   auto *VectorInit = ScalarInit;
3988   if (VF.isVector()) {
3989     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3990     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
3991     VectorInit = Builder.CreateInsertElement(
3992         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3993         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
3994   }
3995 
3996   // We constructed a temporary phi node in the first phase of vectorization.
3997   // This phi node will eventually be deleted.
3998   Builder.SetInsertPoint(
3999       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
4000 
4001   // Create a phi node for the new recurrence. The current value will either be
4002   // the initial value inserted into a vector or loop-varying vector value.
4003   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4004   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4005 
4006   // Get the vectorized previous value of the last part UF - 1. It appears last
4007   // among all unrolled iterations, due to the order of their construction.
4008   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
4009 
4010   // Find and set the insertion point after the previous value if it is an
4011   // instruction.
4012   BasicBlock::iterator InsertPt;
4013   // Note that the previous value may have been constant-folded so it is not
4014   // guaranteed to be an instruction in the vector loop.
4015   // FIXME: Loop invariant values do not form recurrences. We should deal with
4016   //        them earlier.
4017   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4018     InsertPt = LoopVectorBody->getFirstInsertionPt();
4019   else {
4020     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4021     if (isa<PHINode>(PreviousLastPart))
4022       // If the previous value is a phi node, we should insert after all the phi
4023       // nodes in the block containing the PHI to avoid breaking basic block
4024       // verification. Note that the basic block may be different from
4025       // LoopVectorBody if we predicate the loop.
4026       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4027     else
4028       InsertPt = ++PreviousInst->getIterator();
4029   }
4030   Builder.SetInsertPoint(&*InsertPt);
4031 
4032   // We will construct a vector for the recurrence by combining the values for
4033   // the current and previous iterations. This is the required shuffle mask.
4034   assert(!VF.isScalable());
4035   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
4036   ShuffleMask[0] = VF.getKnownMinValue() - 1;
4037   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
4038     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
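  // For VF = 4 this mask is <3, 4, 5, 6>: the last element of the recurrence
  // phi followed by the first three elements of the current value, matching
  // the v3 = vector(v1(3), v2(0, 1, 2)) step in the example above.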
4039 
4040   // The vector from which to take the initial value for the current iteration
4041   // (actual or unrolled). Initially, this is the vector phi node.
4042   Value *Incoming = VecPhi;
4043 
4044   // Shuffle the current and previous vector and update the vector parts.
4045   for (unsigned Part = 0; Part < UF; ++Part) {
4046     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
4047     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
4048     auto *Shuffle =
4049         VF.isVector()
4050             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
4051             : Incoming;
4052     PhiPart->replaceAllUsesWith(Shuffle);
4053     cast<Instruction>(PhiPart)->eraseFromParent();
4054     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
4055     Incoming = PreviousPart;
4056   }
4057 
4058   // Fix the latch value of the new recurrence in the vector loop.
4059   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4060 
4061   // Extract the last vector element in the middle block. This will be the
4062   // initial value for the recurrence when jumping to the scalar loop.
4063   auto *ExtractForScalar = Incoming;
4064   if (VF.isVector()) {
4065     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4066     ExtractForScalar = Builder.CreateExtractElement(
4067         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
4068         "vector.recur.extract");
4069   }
4070   // Extract the second-to-last element in the middle block if the
4071   // Phi is used outside the loop. We need to extract the phi itself
4072   // and not the last element (the phi update in the current iteration). This
4073   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4074   // when the scalar loop is not run at all.
4075   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4076   if (VF.isVector())
4077     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4078         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4079         "vector.recur.extract.for.phi");
4080   // When the loop is unrolled without vectorizing, initialize
4081   // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
4082   // value of `Incoming`. This is analogous to the vectorized case above:
4083   // extracting the second-to-last element when VF > 1.
4084   else if (UF > 1)
4085     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
4086 
4087   // Fix the initial value of the original recurrence in the scalar loop.
4088   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4089   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4090   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4091     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4092     Start->addIncoming(Incoming, BB);
4093   }
4094 
4095   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4096   Phi->setName("scalar.recur");
4097 
4098   // Finally, fix users of the recurrence outside the loop. The users will need
4099   // either the last value of the scalar recurrence or the last value of the
4100   // vector recurrence we extracted in the middle block. Since the loop is in
4101   // LCSSA form, we just need to find all the phi nodes for the original scalar
4102   // recurrence in the exit block, and then add an edge for the middle block.
4103   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4104     if (LCSSAPhi.getIncomingValue(0) == Phi) {
4105       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4106     }
4107   }
4108 }
4109 
4110 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
4111   Constant *Zero = Builder.getInt32(0);
4112 
4113   // Get its reduction variable descriptor.
4114   assert(Legal->isReductionVariable(Phi) &&
4115          "Unable to find the reduction variable");
4116   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4117 
4118   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4119   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4120   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4121   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
4122     RdxDesc.getMinMaxRecurrenceKind();
4123   setDebugLocFromInst(Builder, ReductionStartValue);
4124   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4125 
4126   // We need to generate a reduction vector from the incoming scalar.
4127   // To do so, we need to generate the 'identity' vector and override
4128   // one of the elements with the incoming scalar reduction. We need
4129   // to do it in the vector-loop preheader.
4130   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4131 
4132   // This is the vector-clone of the value that leaves the loop.
4133   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
4134 
4135   // Find the reduction identity value: zero for addition, or, and xor; one
4136   // for multiplication; -1 (all bits set) for and.
4137   Value *Identity;
4138   Value *VectorStart;
4139   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
4140       RK == RecurrenceDescriptor::RK_FloatMinMax) {
4141     // MinMax reductions have the start value as their identity.
4142     if (VF.isScalar() || IsInLoopReductionPhi) {
4143       VectorStart = Identity = ReductionStartValue;
4144     } else {
4145       VectorStart = Identity =
4146         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
4147     }
4148   } else {
4149     // Handle other reduction kinds:
4150     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
4151         RK, MinMaxKind, VecTy->getScalarType());
4152     if (VF.isScalar() || IsInLoopReductionPhi) {
4153       Identity = Iden;
4154       // This vector is the Identity vector where the first element is the
4155       // incoming scalar reduction.
4156       VectorStart = ReductionStartValue;
4157     } else {
4158       Identity = ConstantVector::getSplat(VF, Iden);
4159 
4160       // This vector is the Identity vector where the first element is the
4161       // incoming scalar reduction.
4162       VectorStart =
4163         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
4164     }
4165   }
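  // For example, an integer add reduction with start value 42 and VF = 4 uses
  // Identity = <0, 0, 0, 0> and VectorStart = <42, 0, 0, 0>.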
4166 
4167   // Wrap flags are in general invalid after vectorization, clear them.
4168   clearReductionWrapFlags(RdxDesc);
4169 
4170   // Fix the vector-loop phi.
4171 
4172   // Reductions do not have to start at zero. They can start with
4173   // any loop invariant values.
4174   BasicBlock *Latch = OrigLoop->getLoopLatch();
4175   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4176 
4177   for (unsigned Part = 0; Part < UF; ++Part) {
4178     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
4179     Value *Val = getOrCreateVectorValue(LoopVal, Part);
4180     // Make sure to add the reduction start value only to the
4181     // first unroll part.
4182     Value *StartVal = (Part == 0) ? VectorStart : Identity;
4183     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
4184     cast<PHINode>(VecRdxPhi)
4185       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4186   }
4187 
4188   // Before each round, move the insertion point right between
4189   // the PHIs and the values we are going to write.
4190   // This allows us to write both PHINodes and the extractelement
4191   // instructions.
4192   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4193 
4194   setDebugLocFromInst(Builder, LoopExitInst);
4195 
4196   // If the tail is folded by masking, the vector value leaving the loop should
4197   // be a Select choosing between the vectorized LoopExitInst and the vectorized
4198   // Phi, instead of the LoopExitInst alone. For an in-loop reduction the
4199   // reduction will already be predicated, and does not need to be handled here.
4200   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4201     for (unsigned Part = 0; Part < UF; ++Part) {
4202       Value *VecLoopExitInst =
4203           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4204       Value *Sel = nullptr;
4205       for (User *U : VecLoopExitInst->users()) {
4206         if (isa<SelectInst>(U)) {
4207           assert(!Sel && "Reduction exit feeding two selects");
4208           Sel = U;
4209         } else
4210           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4211       }
4212       assert(Sel && "Reduction exit feeds no select");
4213       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4214 
4215       // If the target can create a predicated operator for the reduction at no
4216       // extra cost in the loop (for example a predicated vadd), it can be
4217       // cheaper for the select to remain in the loop than be sunk out of it,
4218       // and so use the select value for the phi instead of the old
4219       // LoopExitValue.
4220       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4221       if (PreferPredicatedReductionSelect ||
4222           TTI->preferPredicatedReductionSelect(
4223               RdxDesc.getRecurrenceBinOp(), Phi->getType(),
4224               TargetTransformInfo::ReductionFlags())) {
4225         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4226         VecRdxPhi->setIncomingValueForBlock(
4227             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4228       }
4229     }
4230   }
4231 
4232   // If the vector reduction can be performed in a smaller type, we truncate
4233   // then extend the loop exit value to enable InstCombine to evaluate the
4234   // entire expression in the smaller type.
4235   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4236     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4237     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4238     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4239     Builder.SetInsertPoint(
4240         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4241     VectorParts RdxParts(UF);
4242     for (unsigned Part = 0; Part < UF; ++Part) {
4243       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4244       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4245       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4246                                         : Builder.CreateZExt(Trunc, VecTy);
4247       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4248            UI != RdxParts[Part]->user_end();)
4249         if (*UI != Trunc) {
4250           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4251           RdxParts[Part] = Extnd;
4252         } else {
4253           ++UI;
4254         }
4255     }
4256     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4257     for (unsigned Part = 0; Part < UF; ++Part) {
4258       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4259       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4260     }
4261   }
4262 
4263   // Reduce all of the unrolled parts into a single vector.
4264   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4265   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4266 
4267   // The middle block terminator has already been assigned a DebugLoc here (the
4268   // OrigLoop's single latch terminator). We want the whole middle block to
4269   // appear to execute on this line because: (a) it is all compiler generated,
4270   // (b) these instructions are always executed after evaluating the latch
4271   // conditional branch, and (c) other passes may add new predecessors which
4272   // terminate on this line. This is the easiest way to ensure we don't
4273   // accidentally cause an extra step back into the loop while debugging.
4274   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4275   for (unsigned Part = 1; Part < UF; ++Part) {
4276     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4277     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4278       // Floating point operations had to be 'fast' to enable the reduction.
4279       ReducedPartRdx = addFastMathFlag(
4280           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4281                               ReducedPartRdx, "bin.rdx"),
4282           RdxDesc.getFastMathFlags());
4283     else
4284       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4285                                       RdxPart);
4286   }
4287 
4288   // Create the reduction after the loop. Note that inloop reductions create the
4289   // target reduction in the loop using a Reduction recipe.
4290   if (VF.isVector() && !IsInLoopReductionPhi) {
4291     bool NoNaN = Legal->hasFunNoNaNAttr();
4292     ReducedPartRdx =
4293         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4294     // If the reduction can be performed in a smaller type, we need to extend
4295     // the reduction to the wider type before we branch to the original loop.
4296     if (Phi->getType() != RdxDesc.getRecurrenceType())
4297       ReducedPartRdx =
4298         RdxDesc.isSigned()
4299         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4300         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4301   }
4302 
4303   // Create a phi node that merges control-flow from the backedge-taken check
4304   // block and the middle block.
4305   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4306                                         LoopScalarPreHeader->getTerminator());
4307   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4308     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4309   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4310 
4311   // Now, we need to fix the users of the reduction variable
4312   // inside and outside of the scalar remainder loop.
4313   // We know that the loop is in LCSSA form. We need to update the
4314   // PHI nodes in the exit blocks.
4315   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4316     // All PHINodes need to have a single entry edge, or two if
4317     // we already fixed them.
4318     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4319 
4320     // We found a reduction value exit-PHI. Update it with the
4321     // incoming bypass edge.
4322     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4323       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4324   } // end of the LCSSA phi scan.
4325 
4326   // Fix the scalar loop reduction variable with the incoming reduction sum
4327   // from the vector body and from the backedge value.
4328   int IncomingEdgeBlockIdx =
4329     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4330   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4331   // Pick the other block.
4332   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4333   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4334   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4335 }
4336 
4337 void InnerLoopVectorizer::clearReductionWrapFlags(
4338     RecurrenceDescriptor &RdxDesc) {
4339   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4340   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4341       RK != RecurrenceDescriptor::RK_IntegerMult)
4342     return;
4343 
4344   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4345   assert(LoopExitInstr && "null loop exit instruction");
4346   SmallVector<Instruction *, 8> Worklist;
4347   SmallPtrSet<Instruction *, 8> Visited;
4348   Worklist.push_back(LoopExitInstr);
4349   Visited.insert(LoopExitInstr);
4350 
4351   while (!Worklist.empty()) {
4352     Instruction *Cur = Worklist.pop_back_val();
4353     if (isa<OverflowingBinaryOperator>(Cur))
4354       for (unsigned Part = 0; Part < UF; ++Part) {
4355         Value *V = getOrCreateVectorValue(Cur, Part);
4356         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4357       }
4358 
4359     for (User *U : Cur->users()) {
4360       Instruction *UI = cast<Instruction>(U);
4361       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4362           Visited.insert(UI).second)
4363         Worklist.push_back(UI);
4364     }
4365   }
4366 }
4367 
4368 void InnerLoopVectorizer::fixLCSSAPHIs() {
4369   assert(!VF.isScalable() && "the code below assumes fixed width vectors");
4370   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4371     if (LCSSAPhi.getNumIncomingValues() == 1) {
4372       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4373       // Non-instruction incoming values will have only one value.
4374       unsigned LastLane = 0;
4375       if (isa<Instruction>(IncomingValue))
4376         LastLane = Cost->isUniformAfterVectorization(
4377                        cast<Instruction>(IncomingValue), VF)
4378                        ? 0
4379                        : VF.getKnownMinValue() - 1;
4380       // Can be a loop invariant incoming value or the last scalar value to be
4381       // extracted from the vectorized loop.
4382       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4383       Value *lastIncomingValue =
4384           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4385       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4386     }
4387   }
4388 }
4389 
4390 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4391   // The basic block and loop containing the predicated instruction.
4392   auto *PredBB = PredInst->getParent();
4393   auto *VectorLoop = LI->getLoopFor(PredBB);
4394 
4395   // Initialize a worklist with the operands of the predicated instruction.
4396   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4397 
4398   // Holds instructions that we need to analyze again. An instruction may be
4399   // reanalyzed if we don't yet know if we can sink it or not.
4400   SmallVector<Instruction *, 8> InstsToReanalyze;
4401 
4402   // Returns true if a given use occurs in the predicated block. Phi nodes use
4403   // their operands in their corresponding predecessor blocks.
4404   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4405     auto *I = cast<Instruction>(U.getUser());
4406     BasicBlock *BB = I->getParent();
4407     if (auto *Phi = dyn_cast<PHINode>(I))
4408       BB = Phi->getIncomingBlock(
4409           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4410     return BB == PredBB;
4411   };
4412 
4413   // Iteratively sink the scalarized operands of the predicated instruction
4414   // into the block we created for it. When an instruction is sunk, its
4415   // operands are then added to the worklist. The algorithm terminates once a
4416   // full pass over the worklist sinks no further instructions.
4417   bool Changed;
4418   do {
4419     // Add the instructions that need to be reanalyzed to the worklist, and
4420     // reset the changed indicator.
4421     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4422     InstsToReanalyze.clear();
4423     Changed = false;
4424 
4425     while (!Worklist.empty()) {
4426       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4427 
4428       // We can't sink an instruction if it is a phi node, is already in the
4429       // predicated block, is not in the loop, or may have side effects.
4430       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4431           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4432         continue;
4433 
4434       // It's legal to sink the instruction if all its uses occur in the
4435       // predicated block. Otherwise, there's nothing to do yet, and we may
4436       // need to reanalyze the instruction.
4437       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4438         InstsToReanalyze.push_back(I);
4439         continue;
4440       }
4441 
4442       // Move the instruction to the beginning of the predicated block, and add
4443       // its operands to the worklist.
4444       I->moveBefore(&*PredBB->getFirstInsertionPt());
4445       Worklist.insert(I->op_begin(), I->op_end());
4446 
4447       // The sinking may have enabled other instructions to be sunk, so we will
4448       // need to iterate.
4449       Changed = true;
4450     }
4451   } while (Changed);
4452 }
4453 
4454 void InnerLoopVectorizer::fixNonInductionPHIs() {
4455   for (PHINode *OrigPhi : OrigPHIsToFix) {
4456     PHINode *NewPhi =
4457         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4458     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4459 
4460     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4461         predecessors(OrigPhi->getParent()));
4462     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4463         predecessors(NewPhi->getParent()));
4464     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4465            "Scalar and Vector BB should have the same number of predecessors");
4466 
4467     // The insertion point in Builder may be invalidated by the time we get
4468     // here. Force the Builder insertion point to something valid so that we do
4469     // not run into issues during insertion point restore in
4470     // getOrCreateVectorValue calls below.
4471     Builder.SetInsertPoint(NewPhi);
4472 
4473     // The predecessor order is preserved and we can rely on mapping between
4474     // scalar and vector block predecessors.
4475     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4476       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4477 
4478       // When looking up the new scalar/vector values to fix up, use incoming
4479       // values from original phi.
4480       Value *ScIncV =
4481           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4482 
4483       // The scalar incoming value may need a broadcast.
4484       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4485       NewPhi->addIncoming(NewIncV, NewPredBB);
4486     }
4487   }
4488 }
4489 
4490 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4491                                    VPUser &Operands, unsigned UF,
4492                                    ElementCount VF, bool IsPtrLoopInvariant,
4493                                    SmallBitVector &IsIndexLoopInvariant,
4494                                    VPTransformState &State) {
4495   // Construct a vector GEP by widening the operands of the scalar GEP as
4496   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4497   // results in a vector of pointers when at least one operand of the GEP
4498   // is vector-typed. Thus, to keep the representation compact, we only use
4499   // vector-typed operands for loop-varying values.
4500 
4501   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4502     // If we are vectorizing, but the GEP has only loop-invariant operands,
4503     // the GEP we build (by only using vector-typed operands for
4504     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4505     // produce a vector of pointers, we need to either arbitrarily pick an
4506     // operand to broadcast, or broadcast a clone of the original GEP.
4507     // Here, we broadcast a clone of the original.
4508     //
4509     // TODO: If at some point we decide to scalarize instructions having
4510     //       loop-invariant operands, this special case will no longer be
4511     //       required. We would add the scalarization decision to
4512     //       collectLoopScalars() and teach getVectorValue() to broadcast
4513     //       the lane-zero scalar value.
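    // For example, a GEP whose pointer operand and indices are all
    // loop-invariant is cloned once here, and the clone is then splatted into
    // a vector of VF identical pointers for each unroll part.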
4514     auto *Clone = Builder.Insert(GEP->clone());
4515     for (unsigned Part = 0; Part < UF; ++Part) {
4516       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4517       State.set(VPDef, GEP, EntryPart, Part);
4518       addMetadata(EntryPart, GEP);
4519     }
4520   } else {
4521     // If the GEP has at least one loop-varying operand, we are sure to
4522     // produce a vector of pointers. But if we are only unrolling, we want
4523     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4524     // produce with the code below will be scalar (if VF == 1) or vector
4525     // (otherwise). Note that for the unroll-only case, we still maintain
4526     // values in the vector mapping with initVector, as we do for other
4527     // instructions.
4528     for (unsigned Part = 0; Part < UF; ++Part) {
4529       // The pointer operand of the new GEP. If it's loop-invariant, we
4530       // won't broadcast it.
4531       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4532                                      : State.get(Operands.getOperand(0), Part);
4533 
4534       // Collect all the indices for the new GEP. If any index is
4535       // loop-invariant, we won't broadcast it.
4536       SmallVector<Value *, 4> Indices;
4537       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4538         VPValue *Operand = Operands.getOperand(I);
4539         if (IsIndexLoopInvariant[I - 1])
4540           Indices.push_back(State.get(Operand, {0, 0}));
4541         else
4542           Indices.push_back(State.get(Operand, Part));
4543       }
4544 
4545       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4546       // but it should be a vector, otherwise.
4547       auto *NewGEP =
4548           GEP->isInBounds()
4549               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4550                                           Indices)
4551               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4552       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4553              "NewGEP is not a pointer vector");
4554       State.set(VPDef, GEP, NewGEP, Part);
4555       addMetadata(NewGEP, GEP);
4556     }
4557   }
4558 }
4559 
4560 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4561                                               ElementCount VF) {
4562   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4563   PHINode *P = cast<PHINode>(PN);
4564   if (EnableVPlanNativePath) {
4565     // Currently we enter here in the VPlan-native path for non-induction
4566     // PHIs where all control flow is uniform. We simply widen these PHIs.
4567     // Create a vector phi with no operands - the vector phi operands will be
4568     // set at the end of vector code generation.
4569     Type *VecTy =
4570         (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4571     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4572     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4573     OrigPHIsToFix.push_back(P);
4574 
4575     return;
4576   }
4577 
4578   assert(PN->getParent() == OrigLoop->getHeader() &&
4579          "Non-header phis should have been handled elsewhere");
4580 
4581   // In order to support recurrences we need to be able to vectorize Phi nodes.
4582   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4583   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4584   // this value when we vectorize all of the instructions that use the PHI.
4585   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4586     for (unsigned Part = 0; Part < UF; ++Part) {
4587       // This is phase one of vectorizing PHIs.
4588       bool ScalarPHI =
4589           (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4590       Type *VecTy =
4591           ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4592       Value *EntryPart = PHINode::Create(
4593           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4594       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4595     }
4596     return;
4597   }
4598 
4599   setDebugLocFromInst(Builder, P);
4600 
4601   // This PHINode must be an induction variable.
4602   // Make sure that we know about it.
4603   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4604 
4605   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4606   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4607 
4608   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4609   // which can be found from the original scalar operations.
4610   switch (II.getKind()) {
4611   case InductionDescriptor::IK_NoInduction:
4612     llvm_unreachable("Unknown induction");
4613   case InductionDescriptor::IK_IntInduction:
4614   case InductionDescriptor::IK_FpInduction:
4615     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4616   case InductionDescriptor::IK_PtrInduction: {
4617     // Handle the pointer induction variable case.
4618     assert(P->getType()->isPointerTy() && "Unexpected type.");
4619 
4620     if (Cost->isScalarAfterVectorization(P, VF)) {
4621       // This is the normalized induction index that starts counting at zero.
4622       Value *PtrInd =
4623           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4624       // Determine the number of scalars we need to generate for each unroll
4625       // iteration. If the instruction is uniform, we only need to generate the
4626       // first lane. Otherwise, we generate all VF values.
4627       unsigned Lanes =
4628           Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4629       for (unsigned Part = 0; Part < UF; ++Part) {
4630         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4631           Constant *Idx = ConstantInt::get(PtrInd->getType(),
4632                                            Lane + Part * VF.getKnownMinValue());
4633           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4634           Value *SclrGep =
4635               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4636           SclrGep->setName("next.gep");
4637           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4638         }
4639       }
4640       return;
4641     }
4642     assert(isa<SCEVConstant>(II.getStep()) &&
4643            "Induction step not a SCEV constant!");
4644     Type *PhiType = II.getStep()->getType();
4645 
4646     // Build a pointer phi
4647     Value *ScalarStartValue = II.getStartValue();
4648     Type *ScStValueType = ScalarStartValue->getType();
4649     PHINode *NewPointerPhi =
4650         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4651     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4652 
4653     // A pointer induction, performed by using a gep
4654     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4655     Instruction *InductionLoc = LoopLatch->getTerminator();
4656     const SCEV *ScalarStep = II.getStep();
4657     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4658     Value *ScalarStepValue =
4659         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4660     Value *InductionGEP = GetElementPtrInst::Create(
4661         ScStValueType->getPointerElementType(), NewPointerPhi,
4662         Builder.CreateMul(
4663             ScalarStepValue,
4664             ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4665         "ptr.ind", InductionLoc);
4666     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4667 
4668     // Create UF many actual address GEPs that use the pointer phi as their
4669     // base and a vectorized version of the step value as their offset,
4670     // i.e. <step*0, step*1, ..., step*(VF-1)> shifted by Part * VF * step.
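    // For example, with VF = 4, UF = 2 and a step of 1, part 0 uses the
    // offset vector <0, 1, 2, 3> and part 1 uses <4, 5, 6, 7>.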
4671     for (unsigned Part = 0; Part < UF; ++Part) {
4672       SmallVector<Constant *, 8> Indices;
4673         // Create a vector of consecutive offsets: Part*VF, ..., Part*VF + VF - 1.
4674       for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4675         Indices.push_back(
4676             ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4677       Constant *StartOffset = ConstantVector::get(Indices);
4678 
4679       Value *GEP = Builder.CreateGEP(
4680           ScStValueType->getPointerElementType(), NewPointerPhi,
4681           Builder.CreateMul(
4682               StartOffset,
4683               Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4684               "vector.gep"));
4685       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4686     }
4687   }
4688   }
4689 }
4690 
4691 /// A helper function for checking whether an integer division-related
4692 /// instruction may divide by zero (in which case it must be predicated if
4693 /// executed conditionally in the scalar code).
4694 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4695 /// Non-zero divisors that are non compile-time constants will not be
4696 /// converted into multiplication, so we will still end up scalarizing
4697 /// the division, but can do so w/o predication.
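/// For example, 'udiv i32 %a, 7' can never divide by zero and need not be
/// predicated, whereas 'udiv i32 %a, %b' might and therefore must be if it
/// executes conditionally.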
4698 static bool mayDivideByZero(Instruction &I) {
4699   assert((I.getOpcode() == Instruction::UDiv ||
4700           I.getOpcode() == Instruction::SDiv ||
4701           I.getOpcode() == Instruction::URem ||
4702           I.getOpcode() == Instruction::SRem) &&
4703          "Unexpected instruction");
4704   Value *Divisor = I.getOperand(1);
4705   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4706   return !CInt || CInt->isZero();
4707 }
4708 
4709 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4710                                            VPUser &User,
4711                                            VPTransformState &State) {
4712   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4713   switch (I.getOpcode()) {
4714   case Instruction::Call:
4715   case Instruction::Br:
4716   case Instruction::PHI:
4717   case Instruction::GetElementPtr:
4718   case Instruction::Select:
4719     llvm_unreachable("This instruction is handled by a different recipe.");
4720   case Instruction::UDiv:
4721   case Instruction::SDiv:
4722   case Instruction::SRem:
4723   case Instruction::URem:
4724   case Instruction::Add:
4725   case Instruction::FAdd:
4726   case Instruction::Sub:
4727   case Instruction::FSub:
4728   case Instruction::FNeg:
4729   case Instruction::Mul:
4730   case Instruction::FMul:
4731   case Instruction::FDiv:
4732   case Instruction::FRem:
4733   case Instruction::Shl:
4734   case Instruction::LShr:
4735   case Instruction::AShr:
4736   case Instruction::And:
4737   case Instruction::Or:
4738   case Instruction::Xor: {
4739     // Just widen unops and binops.
4740     setDebugLocFromInst(Builder, &I);
4741 
4742     for (unsigned Part = 0; Part < UF; ++Part) {
4743       SmallVector<Value *, 2> Ops;
4744       for (VPValue *VPOp : User.operands())
4745         Ops.push_back(State.get(VPOp, Part));
4746 
4747       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4748 
4749       if (auto *VecOp = dyn_cast<Instruction>(V))
4750         VecOp->copyIRFlags(&I);
4751 
4752       // Use this vector value for all users of the original instruction.
4753       State.set(Def, &I, V, Part);
4754       addMetadata(V, &I);
4755     }
4756 
4757     break;
4758   }
4759   case Instruction::ICmp:
4760   case Instruction::FCmp: {
4761     // Widen compares. Generate vector compares.
4762     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4763     auto *Cmp = cast<CmpInst>(&I);
4764     setDebugLocFromInst(Builder, Cmp);
4765     for (unsigned Part = 0; Part < UF; ++Part) {
4766       Value *A = State.get(User.getOperand(0), Part);
4767       Value *B = State.get(User.getOperand(1), Part);
4768       Value *C = nullptr;
4769       if (FCmp) {
4770         // Propagate fast math flags.
4771         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4772         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4773         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4774       } else {
4775         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4776       }
4777       State.set(Def, &I, C, Part);
4778       addMetadata(C, &I);
4779     }
4780 
4781     break;
4782   }
4783 
4784   case Instruction::ZExt:
4785   case Instruction::SExt:
4786   case Instruction::FPToUI:
4787   case Instruction::FPToSI:
4788   case Instruction::FPExt:
4789   case Instruction::PtrToInt:
4790   case Instruction::IntToPtr:
4791   case Instruction::SIToFP:
4792   case Instruction::UIToFP:
4793   case Instruction::Trunc:
4794   case Instruction::FPTrunc:
4795   case Instruction::BitCast: {
4796     auto *CI = cast<CastInst>(&I);
4797     setDebugLocFromInst(Builder, CI);
4798 
    // Vectorize casts.
4800     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4801     Type *DestTy =
4802         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4803 
4804     for (unsigned Part = 0; Part < UF; ++Part) {
4805       Value *A = State.get(User.getOperand(0), Part);
4806       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4807       State.set(Def, &I, Cast, Part);
4808       addMetadata(Cast, &I);
4809     }
4810     break;
4811   }
4812   default:
4813     // This instruction is not vectorized by simple widening.
4814     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4815     llvm_unreachable("Unhandled instruction!");
4816   } // end of switch.
4817 }
4818 
4819 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4820                                                VPUser &ArgOperands,
4821                                                VPTransformState &State) {
4822   assert(!isa<DbgInfoIntrinsic>(I) &&
4823          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4824   setDebugLocFromInst(Builder, &I);
4825 
4826   Module *M = I.getParent()->getParent()->getParent();
4827   auto *CI = cast<CallInst>(&I);
4828 
4829   SmallVector<Type *, 4> Tys;
4830   for (Value *ArgOperand : CI->arg_operands())
4831     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4832 
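  // ID is the intrinsic that could be used for a vector version of this call,
  // or Intrinsic::not_intrinsic (0) if the call does not map to an intrinsic.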
4833   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4834 
  // The flag indicates whether we use an intrinsic or a plain call for the
  // vectorized version of the instruction, i.e. whether it is beneficial to
  // use the intrinsic rather than a library call.
4838   bool NeedToScalarize = false;
4839   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4840   bool UseVectorIntrinsic =
4841       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4842   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4843          "Instruction should be scalarized elsewhere.");
4844 
4845   for (unsigned Part = 0; Part < UF; ++Part) {
4846     SmallVector<Value *, 4> Args;
4847     for (auto &I : enumerate(ArgOperands.operands())) {
4848       // Some intrinsics have a scalar argument - don't replace it with a
4849       // vector.
4850       Value *Arg;
4851       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4852         Arg = State.get(I.value(), Part);
4853       else
4854         Arg = State.get(I.value(), {0, 0});
4855       Args.push_back(Arg);
4856     }
4857 
4858     Function *VectorF;
4859     if (UseVectorIntrinsic) {
4860       // Use vector version of the intrinsic.
4861       Type *TysForDecl[] = {CI->getType()};
4862       if (VF.isVector()) {
4863         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4864         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4865       }
4866       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4867       assert(VectorF && "Can't retrieve vector intrinsic.");
4868     } else {
4869       // Use vector version of the function call.
4870       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4871 #ifndef NDEBUG
4872       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4873              "Can't create vector function.");
4874 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, &I, V, Part);
    addMetadata(V, &I);
4886   }
4887 }
4888 
4889 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
4890                                                  VPUser &Operands,
4891                                                  bool InvariantCond,
4892                                                  VPTransformState &State) {
4893   setDebugLocFromInst(Builder, &I);
4894 
  // The condition can be loop invariant but still defined inside the
4896   // loop. This means that we can't just use the original 'cond' value.
4897   // We have to take the 'vectorized' value and pick the first lane.
4898   // Instcombine will make this a no-op.
4899   auto *InvarCond =
4900       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4901 
4902   for (unsigned Part = 0; Part < UF; ++Part) {
4903     Value *Cond =
4904         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4905     Value *Op0 = State.get(Operands.getOperand(1), Part);
4906     Value *Op1 = State.get(Operands.getOperand(2), Part);
4907     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4908     State.set(VPDef, &I, Sel, Part);
4909     addMetadata(Sel, &I);
4910   }
4911 }
4912 
4913 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4914   // We should not collect Scalars more than once per VF. Right now, this
4915   // function is called from collectUniformsAndScalars(), which already does
4916   // this check. Collecting Scalars for VF=1 does not make any sense.
4917   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4918          "This function should not be visited twice for the same VF");
4919 
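  // Worklist holds the instructions that we currently expect to remain scalar
  // after vectorization; it is seeded below and then expanded transitively.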
4920   SmallSetVector<Instruction *, 8> Worklist;
4921 
4922   // These sets are used to seed the analysis with pointers used by memory
4923   // accesses that will remain scalar.
4924   SmallSetVector<Instruction *, 8> ScalarPtrs;
4925   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4926   auto *Latch = TheLoop->getLoopLatch();
4927 
4928   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4929   // The pointer operands of loads and stores will be scalar as long as the
4930   // memory access is not a gather or scatter operation. The value operand of a
4931   // store will remain scalar if the store is scalarized.
4932   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4933     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4934     assert(WideningDecision != CM_Unknown &&
4935            "Widening decision should be ready at this moment");
4936     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4937       if (Ptr == Store->getValueOperand())
4938         return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
4941     return WideningDecision != CM_GatherScatter;
4942   };
4943 
4944   // A helper that returns true if the given value is a bitcast or
4945   // getelementptr instruction contained in the loop.
4946   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4947     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4948             isa<GetElementPtrInst>(V)) &&
4949            !TheLoop->isLoopInvariant(V);
4950   };
4951 
4952   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4953     if (!isa<PHINode>(Ptr) ||
4954         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4955       return false;
4956     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4957     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4958       return false;
4959     return isScalarUse(MemAccess, Ptr);
4960   };
4961 
  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is a pointer induction of the loop, it and its update instruction
  // are inserted into the Worklist. If the use will be a scalar use, and the
  // pointer is only used by memory accesses, we place the pointer in
  // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
4967   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4968     if (isScalarPtrInduction(MemAccess, Ptr)) {
4969       Worklist.insert(cast<Instruction>(Ptr));
4970       Instruction *Update = cast<Instruction>(
4971           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4972       Worklist.insert(Update);
4973       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4974                         << "\n");
4975       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4976                         << "\n");
4977       return;
4978     }
4979     // We only care about bitcast and getelementptr instructions contained in
4980     // the loop.
4981     if (!isLoopVaryingBitCastOrGEP(Ptr))
4982       return;
4983 
4984     // If the pointer has already been identified as scalar (e.g., if it was
4985     // also identified as uniform), there's nothing to do.
4986     auto *I = cast<Instruction>(Ptr);
4987     if (Worklist.count(I))
4988       return;
4989 
4990     // If the use of the pointer will be a scalar use, and all users of the
4991     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4992     // place the pointer in PossibleNonScalarPtrs.
4993     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4994           return isa<LoadInst>(U) || isa<StoreInst>(U);
4995         }))
4996       ScalarPtrs.insert(I);
4997     else
4998       PossibleNonScalarPtrs.insert(I);
4999   };
5000 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
5005   //
5006   // (1) Add to the worklist all instructions that have been identified as
5007   // uniform-after-vectorization.
5008   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5009 
5010   // (2) Add to the worklist all bitcast and getelementptr instructions used by
5011   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
5013   // scatter operation. The value operand of a store will remain scalar if the
5014   // store is scalarized.
5015   for (auto *BB : TheLoop->blocks())
5016     for (auto &I : *BB) {
5017       if (auto *Load = dyn_cast<LoadInst>(&I)) {
5018         evaluatePtrUse(Load, Load->getPointerOperand());
5019       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5020         evaluatePtrUse(Store, Store->getPointerOperand());
5021         evaluatePtrUse(Store, Store->getValueOperand());
5022       }
5023     }
5024   for (auto *I : ScalarPtrs)
5025     if (!PossibleNonScalarPtrs.count(I)) {
5026       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5027       Worklist.insert(I);
5028     }
5029 
5030   // Insert the forced scalars.
5031   // FIXME: Currently widenPHIInstruction() often creates a dead vector
5032   // induction variable when the PHI user is scalarized.
5033   auto ForcedScalar = ForcedScalars.find(VF);
5034   if (ForcedScalar != ForcedScalars.end())
5035     for (auto *I : ForcedScalar->second)
5036       Worklist.insert(I);
5037 
5038   // Expand the worklist by looking through any bitcasts and getelementptr
5039   // instructions we've already identified as scalar. This is similar to the
5040   // expansion step in collectLoopUniforms(); however, here we're only
5041   // expanding to include additional bitcasts and getelementptr instructions.
5042   unsigned Idx = 0;
5043   while (Idx != Worklist.size()) {
5044     Instruction *Dst = Worklist[Idx++];
5045     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5046       continue;
5047     auto *Src = cast<Instruction>(Dst->getOperand(0));
5048     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5049           auto *J = cast<Instruction>(U);
5050           return !TheLoop->contains(J) || Worklist.count(J) ||
5051                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5052                   isScalarUse(J, Src));
5053         })) {
5054       Worklist.insert(Src);
5055       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5056     }
5057   }
5058 
5059   // An induction variable will remain scalar if all users of the induction
5060   // variable and induction variable update remain scalar.
5061   for (auto &Induction : Legal->getInductionVars()) {
5062     auto *Ind = Induction.first;
5063     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5064 
5065     // If tail-folding is applied, the primary induction variable will be used
5066     // to feed a vector compare.
5067     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5068       continue;
5069 
5070     // Determine if all users of the induction variable are scalar after
5071     // vectorization.
5072     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5073       auto *I = cast<Instruction>(U);
5074       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5075     });
5076     if (!ScalarInd)
5077       continue;
5078 
5079     // Determine if all users of the induction variable update instruction are
5080     // scalar after vectorization.
5081     auto ScalarIndUpdate =
5082         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5083           auto *I = cast<Instruction>(U);
5084           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5085         });
5086     if (!ScalarIndUpdate)
5087       continue;
5088 
5089     // The induction variable and its update instruction will remain scalar.
5090     Worklist.insert(Ind);
5091     Worklist.insert(IndUpdate);
5092     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5093     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5094                       << "\n");
5095   }
5096 
5097   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5098 }
5099 
5100 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
5101                                                          ElementCount VF) {
5102   assert(!VF.isScalable() && "scalable vectors not yet supported.");
5103   if (!blockNeedsPredication(I->getParent()))
5104     return false;
  switch (I->getOpcode()) {
5106   default:
5107     break;
5108   case Instruction::Load:
5109   case Instruction::Store: {
5110     if (!Legal->isMaskRequired(I))
5111       return false;
5112     auto *Ptr = getLoadStorePointerOperand(I);
5113     auto *Ty = getMemInstValueType(I);
5114     // We have already decided how to vectorize this instruction, get that
5115     // result.
5116     if (VF.isVector()) {
5117       InstWidening WideningDecision = getWideningDecision(I, VF);
5118       assert(WideningDecision != CM_Unknown &&
5119              "Widening decision should be ready at this moment");
5120       return WideningDecision == CM_Scalarize;
5121     }
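    // For scalar VFs there is no widening decision; the access needs
    // predication unless the target supports a masked form of it.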
5122     const Align Alignment = getLoadStoreAlignment(I);
5123     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5124                                 isLegalMaskedGather(Ty, Alignment))
5125                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5126                                 isLegalMaskedScatter(Ty, Alignment));
5127   }
5128   case Instruction::UDiv:
5129   case Instruction::SDiv:
5130   case Instruction::SRem:
5131   case Instruction::URem:
5132     return mayDivideByZero(*I);
5133   }
5134   return false;
5135 }
5136 
5137 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5138     Instruction *I, ElementCount VF) {
5139   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5140   assert(getWideningDecision(I, VF) == CM_Unknown &&
5141          "Decision should not be set yet.");
5142   auto *Group = getInterleavedAccessGroup(I);
5143   assert(Group && "Must have a group.");
5144 
  // If the instruction's allocated size doesn't equal its type size, it
5146   // requires padding and will be scalarized.
5147   auto &DL = I->getModule()->getDataLayout();
5148   auto *ScalarTy = getMemInstValueType(I);
5149   if (hasIrregularType(ScalarTy, DL, VF))
5150     return false;
5151 
5152   // Check if masking is required.
5153   // A Group may need masking for one of two reasons: it resides in a block that
5154   // needs predication, or it was decided to use masking to deal with gaps.
5155   bool PredicatedAccessRequiresMasking =
5156       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5157   bool AccessWithGapsRequiresMasking =
5158       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5159   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5160     return true;
5161 
5162   // If masked interleaving is required, we expect that the user/target had
5163   // enabled it, because otherwise it either wouldn't have been created or
5164   // it should have been invalidated by the CostModel.
5165   assert(useMaskedInterleavedAccesses(TTI) &&
5166          "Masked interleave-groups for predicated accesses are not enabled.");
5167 
5168   auto *Ty = getMemInstValueType(I);
5169   const Align Alignment = getLoadStoreAlignment(I);
5170   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5171                           : TTI.isLegalMaskedStore(Ty, Alignment);
5172 }
5173 
5174 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5175     Instruction *I, ElementCount VF) {
5176   // Get and ensure we have a valid memory instruction.
5177   LoadInst *LI = dyn_cast<LoadInst>(I);
5178   StoreInst *SI = dyn_cast<StoreInst>(I);
5179   assert((LI || SI) && "Invalid memory instruction");
5180 
5181   auto *Ptr = getLoadStorePointerOperand(I);
5182 
  // First of all, in order to be widened the pointer must be consecutive.
5184   if (!Legal->isConsecutivePtr(Ptr))
5185     return false;
5186 
5187   // If the instruction is a store located in a predicated block, it will be
5188   // scalarized.
5189   if (isScalarWithPredication(I))
5190     return false;
5191 
  // If the instruction's allocated size doesn't equal its type size, it
5193   // requires padding and will be scalarized.
5194   auto &DL = I->getModule()->getDataLayout();
5195   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5196   if (hasIrregularType(ScalarTy, DL, VF))
5197     return false;
5198 
5199   return true;
5200 }
5201 
5202 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5203   // We should not collect Uniforms more than once per VF. Right now,
5204   // this function is called from collectUniformsAndScalars(), which
5205   // already does this check. Collecting Uniforms for VF=1 does not make any
5206   // sense.
5207 
5208   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5209          "This function should not be visited twice for the same VF");
5210 
  // Visit the list of Uniforms. Even if we do not find any uniform value, we
  // will not analyze this VF again; Uniforms.count(VF) will return 1.
5213   Uniforms[VF].clear();
5214 
5215   // We now know that the loop is vectorizable!
5216   // Collect instructions inside the loop that will remain uniform after
5217   // vectorization.
5218 
5219   // Global values, params and instructions outside of current loop are out of
5220   // scope.
5221   auto isOutOfScope = [&](Value *V) -> bool {
5222     Instruction *I = dyn_cast<Instruction>(V);
5223     return (!I || !TheLoop->contains(I));
5224   };
5225 
5226   SetVector<Instruction *> Worklist;
5227   BasicBlock *Latch = TheLoop->getLoopLatch();
5228 
5229   // Instructions that are scalar with predication must not be considered
5230   // uniform after vectorization, because that would create an erroneous
5231   // replicating region where only a single instance out of VF should be formed.
  // TODO: optimize such rare cases if found important, see PR40816.
5233   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5234     if (isOutOfScope(I)) {
5235       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5236                         << *I << "\n");
5237       return;
5238     }
5239     if (isScalarWithPredication(I, VF)) {
5240       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5241                         << *I << "\n");
5242       return;
5243     }
5244     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5245     Worklist.insert(I);
5246   };
5247 
5248   // Start with the conditional branch. If the branch condition is an
5249   // instruction contained in the loop that is only used by the branch, it is
5250   // uniform.
5251   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5252   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5253     addToWorklistIfAllowed(Cmp);
5254 
5255   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5256     InstWidening WideningDecision = getWideningDecision(I, VF);
5257     assert(WideningDecision != CM_Unknown &&
5258            "Widening decision should be ready at this moment");
5259 
5260     // A uniform memory op is itself uniform.  We exclude uniform stores
5261     // here as they demand the last lane, not the first one.
5262     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5263       assert(WideningDecision == CM_Scalarize);
5264       return true;
5265     }
5266 
5267     return (WideningDecision == CM_Widen ||
5268             WideningDecision == CM_Widen_Reverse ||
5269             WideningDecision == CM_Interleave);
5270   };
5271 
5273   // Returns true if Ptr is the pointer operand of a memory access instruction
5274   // I, and I is known to not require scalarization.
5275   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5276     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5277   };
5278 
5279   // Holds a list of values which are known to have at least one uniform use.
5280   // Note that there may be other uses which aren't uniform.  A "uniform use"
5281   // here is something which only demands lane 0 of the unrolled iterations;
5282   // it does not imply that all lanes produce the same value (e.g. this is not
5283   // the usual meaning of uniform)
5284   SmallPtrSet<Value *, 8> HasUniformUse;
5285 
5286   // Scan the loop for instructions which are either a) known to have only
5287   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5288   for (auto *BB : TheLoop->blocks())
5289     for (auto &I : *BB) {
5290       // If there's no pointer operand, there's nothing to do.
5291       auto *Ptr = getLoadStorePointerOperand(&I);
5292       if (!Ptr)
5293         continue;
5294 
5295       // A uniform memory op is itself uniform.  We exclude uniform stores
5296       // here as they demand the last lane, not the first one.
5297       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5298         addToWorklistIfAllowed(&I);
5299 
5300       if (isUniformDecision(&I, VF)) {
5301         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5302         HasUniformUse.insert(Ptr);
5303       }
5304     }
5305 
5306   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5307   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5308   // disallows uses outside the loop as well.
5309   for (auto *V : HasUniformUse) {
5310     if (isOutOfScope(V))
5311       continue;
5312     auto *I = cast<Instruction>(V);
5313     auto UsersAreMemAccesses =
5314       llvm::all_of(I->users(), [&](User *U) -> bool {
5315         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5316       });
5317     if (UsersAreMemAccesses)
5318       addToWorklistIfAllowed(I);
5319   }
5320 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // a uniform instruction will only be used by uniform instructions.
5324   unsigned idx = 0;
5325   while (idx != Worklist.size()) {
5326     Instruction *I = Worklist[idx++];
5327 
5328     for (auto OV : I->operand_values()) {
5329       // isOutOfScope operands cannot be uniform instructions.
5330       if (isOutOfScope(OV))
5331         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
5334       auto *OP = dyn_cast<PHINode>(OV);
5335       if (OP && Legal->isFirstOrderRecurrence(OP))
5336         continue;
5337       // If all the users of the operand are uniform, then add the
5338       // operand into the uniform worklist.
5339       auto *OI = cast<Instruction>(OV);
5340       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5341             auto *J = cast<Instruction>(U);
5342             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5343           }))
5344         addToWorklistIfAllowed(OI);
5345     }
5346   }
5347 
5348   // For an instruction to be added into Worklist above, all its users inside
5349   // the loop should also be in Worklist. However, this condition cannot be
5350   // true for phi nodes that form a cyclic dependence. We must process phi
5351   // nodes separately. An induction variable will remain uniform if all users
5352   // of the induction variable and induction variable update remain uniform.
5353   // The code below handles both pointer and non-pointer induction variables.
5354   for (auto &Induction : Legal->getInductionVars()) {
5355     auto *Ind = Induction.first;
5356     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5357 
5358     // Determine if all users of the induction variable are uniform after
5359     // vectorization.
5360     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5361       auto *I = cast<Instruction>(U);
5362       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5363              isVectorizedMemAccessUse(I, Ind);
5364     });
5365     if (!UniformInd)
5366       continue;
5367 
5368     // Determine if all users of the induction variable update instruction are
5369     // uniform after vectorization.
5370     auto UniformIndUpdate =
5371         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5372           auto *I = cast<Instruction>(U);
5373           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5374                  isVectorizedMemAccessUse(I, IndUpdate);
5375         });
5376     if (!UniformIndUpdate)
5377       continue;
5378 
5379     // The induction variable and its update instruction will remain uniform.
5380     addToWorklistIfAllowed(Ind);
5381     addToWorklistIfAllowed(IndUpdate);
5382   }
5383 
5384   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5385 }
5386 
5387 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5388   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5389 
5390   if (Legal->getRuntimePointerChecking()->Need) {
5391     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5392         "runtime pointer checks needed. Enable vectorization of this "
5393         "loop with '#pragma clang loop vectorize(enable)' when "
5394         "compiling with -Os/-Oz",
5395         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5396     return true;
5397   }
5398 
5399   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5400     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5401         "runtime SCEV checks needed. Enable vectorization of this "
5402         "loop with '#pragma clang loop vectorize(enable)' when "
5403         "compiling with -Os/-Oz",
5404         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5405     return true;
5406   }
5407 
5408   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5409   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
        "runtime stride == 1 checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
5414     return true;
5415   }
5416 
5417   return false;
5418 }
5419 
5420 Optional<ElementCount>
5421 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5422   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do so, since the check result is still likely
    // to be dynamically uniform if the target can skip it.
5425     reportVectorizationFailure(
5426         "Not inserting runtime ptr check for divergent target",
5427         "runtime pointer checks needed. Not enabled for divergent target",
5428         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5429     return None;
5430   }
5431 
5432   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5433   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5434   if (TC == 1) {
5435     reportVectorizationFailure("Single iteration (non) loop",
5436         "loop trip count is one, irrelevant for vectorization",
5437         "SingleIterationLoop", ORE, TheLoop);
5438     return None;
5439   }
5440 
5441   ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5442 
5443   switch (ScalarEpilogueStatus) {
5444   case CM_ScalarEpilogueAllowed:
5445     return MaxVF;
5446   case CM_ScalarEpilogueNotNeededUsePredicate:
5447     LLVM_DEBUG(
5448         dbgs() << "LV: vector predicate hint/switch found.\n"
5449                << "LV: Not allowing scalar epilogue, creating predicated "
5450                << "vector loop.\n");
5451     break;
5452   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5453     // fallthrough as a special case of OptForSize
5454   case CM_ScalarEpilogueNotAllowedOptSize:
5455     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5456       LLVM_DEBUG(
5457           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5458     else
5459       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5460                         << "count.\n");
5461 
5462     // Bail if runtime checks are required, which are not good when optimising
5463     // for size.
5464     if (runtimeChecksRequired())
5465       return None;
5466     break;
5467   }
5468 
  // Now try to fold the tail by masking.
5470 
5471   // Invalidate interleave groups that require an epilogue if we can't mask
5472   // the interleave-group.
5473   if (!useMaskedInterleavedAccesses(TTI)) {
5474     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5475            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
5478     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5479   }
5480 
5481   assert(!MaxVF.isScalable() &&
5482          "Scalable vectors do not yet support tail folding");
5483   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5484          "MaxVF must be a power of 2");
5485   unsigned MaxVFtimesIC =
5486       UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
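  // For example (illustrative numbers): TC = 64 with MaxVF = 8 and UserIC = 2
  // gives MaxVFtimesIC = 16, and 64 % 16 == 0, so no tail remains.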
5487   if (TC > 0 && TC % MaxVFtimesIC == 0) {
5488     // Accept MaxVF if we do not have a tail.
5489     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5490     return MaxVF;
5491   }
5492 
5493   // If we don't know the precise trip count, or if the trip count that we
5494   // found modulo the vectorization factor is not zero, try to fold the tail
5495   // by masking.
5496   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5497   if (Legal->prepareToFoldTailByMasking()) {
5498     FoldTailByMasking = true;
5499     return MaxVF;
5500   }
5501 
5502   // If there was a tail-folding hint/switch, but we can't fold the tail by
5503   // masking, fallback to a vectorization with a scalar epilogue.
5504   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    if (PreferPredicateOverEpilogue ==
        PreferPredicateTy::PredicateOrDontVectorize) {
5506       LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5507       return None;
5508     }
5509     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5510                          "scalar epilogue instead.\n");
5511     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5512     return MaxVF;
5513   }
5514 
5515   if (TC == 0) {
5516     reportVectorizationFailure(
5517         "Unable to calculate the loop count due to complex control flow",
5518         "unable to calculate the loop count due to complex control flow",
5519         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5520     return None;
5521   }
5522 
5523   reportVectorizationFailure(
5524       "Cannot optimize for size and vectorize at the same time.",
5525       "cannot optimize for size and vectorize at the same time. "
5526       "Enable vectorization of this loop with '#pragma clang loop "
5527       "vectorize(enable)' when compiling with -Os/-Oz",
5528       "NoTailLoopWithOptForSize", ORE, TheLoop);
5529   return None;
5530 }
5531 
5532 ElementCount
5533 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5534                                                  ElementCount UserVF) {
5535   assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
5536   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5537   unsigned SmallestType, WidestType;
5538   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5539   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5540 
5541   // Get the maximum safe dependence distance in bits computed by LAA.
5542   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
5544   // dependence distance).
5545   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
5546 
5547   if (UserVF.isNonZero()) {
5548     // If legally unsafe, clamp the user vectorization factor to a safe value.
5549     unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
5550     if (UserVF.getFixedValue() <= MaxSafeVF)
5551       return UserVF;
5552 
5553     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5554                       << " is unsafe, clamping to max safe VF=" << MaxSafeVF
5555                       << ".\n");
5556     ORE->emit([&]() {
5557       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5558                                         TheLoop->getStartLoc(),
5559                                         TheLoop->getHeader())
5560              << "User-specified vectorization factor "
5561              << ore::NV("UserVectorizationFactor", UserVF)
5562              << " is unsafe, clamping to maximum safe vectorization factor "
5563              << ore::NV("VectorizationFactor", MaxSafeVF);
5564     });
5565     return ElementCount::getFixed(MaxSafeVF);
5566   }
5567 
5568   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
5569 
5570   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
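  // For example (illustrative numbers), 256-bit wide registers and a widest
  // type of 64 bits give MaxVectorSize = PowerOf2Floor(256 / 64) = 4.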
5572   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
5573 
5574   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5575                     << " / " << WidestType << " bits.\n");
5576   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5577                     << WidestRegister << " bits.\n");
5578 
5579   assert(MaxVectorSize <= WidestRegister &&
5580          "Did not expect to pack so many elements"
5581          " into one vector!");
5582   if (MaxVectorSize == 0) {
5583     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5584     MaxVectorSize = 1;
5585     return ElementCount::getFixed(MaxVectorSize);
5586   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5587              isPowerOf2_32(ConstTripCount)) {
5588     // We need to clamp the VF to be the ConstTripCount. There is no point in
5589     // choosing a higher viable VF as done in the loop below.
5590     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5591                       << ConstTripCount << "\n");
5592     MaxVectorSize = ConstTripCount;
5593     return ElementCount::getFixed(MaxVectorSize);
5594   }
5595 
5596   unsigned MaxVF = MaxVectorSize;
5597   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5598       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5599     // Collect all viable vectorization factors larger than the default MaxVF
5600     // (i.e. MaxVectorSize).
5601     SmallVector<ElementCount, 8> VFs;
5602     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5603     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5604       VFs.push_back(ElementCount::getFixed(VS));
5605 
5606     // For each VF calculate its register usage.
5607     auto RUs = calculateRegisterUsage(VFs);
5608 
5609     // Select the largest VF which doesn't require more registers than existing
5610     // ones.
5611     for (int i = RUs.size() - 1; i >= 0; --i) {
5612       bool Selected = true;
5613       for (auto& pair : RUs[i].MaxLocalUsers) {
5614         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5615         if (pair.second > TargetNumRegisters)
5616           Selected = false;
5617       }
5618       if (Selected) {
5619         MaxVF = VFs[i].getKnownMinValue();
5620         break;
5621       }
5622     }
5623     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5624       if (MaxVF < MinVF) {
5625         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5626                           << ") with target's minimum: " << MinVF << '\n');
5627         MaxVF = MinVF;
5628       }
5629     }
5630   }
5631   return ElementCount::getFixed(MaxVF);
5632 }
5633 
5634 VectorizationFactor
5635 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
5636   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
5637 
5638   float Cost = expectedCost(ElementCount::getFixed(1)).first;
5639   const float ScalarCost = Cost;
5640   unsigned Width = 1;
5641   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5642 
5643   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5644   if (ForceVectorization && MaxVF.isVector()) {
5645     // Ignore scalar width, because the user explicitly wants vectorization.
5646     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5647     // evaluation.
5648     Cost = std::numeric_limits<float>::max();
5649   }
5650 
5651   for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
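    // For example (illustrative numbers only), a scalar cost of 8 and a VF=4
    // cost of 20 give a per-lane vector cost of 5, so VF=4 beats the scalar
    // loop.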
5655     VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5656     float VectorCost = C.first / (float)i;
5657     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5658                       << " costs: " << (int)VectorCost << ".\n");
5659     if (!C.second && !ForceVectorization) {
5660       LLVM_DEBUG(
5661           dbgs() << "LV: Not considering vector loop of width " << i
5662                  << " because it will not generate any vector instructions.\n");
5663       continue;
5664     }
5665 
    // If profitable, add it to the ProfitableVFs list.
5667     if (VectorCost < ScalarCost) {
5668       ProfitableVFs.push_back(VectorizationFactor(
5669           {ElementCount::getFixed(i), (unsigned)VectorCost}));
5670     }
5671 
5672     if (VectorCost < Cost) {
5673       Cost = VectorCost;
5674       Width = i;
5675     }
5676   }
5677 
5678   if (!EnableCondStoresVectorization && NumPredStores) {
5679     reportVectorizationFailure("There are conditional stores.",
5680         "store that is conditionally executed prevents vectorization",
5681         "ConditionalStore", ORE, TheLoop);
5682     Width = 1;
5683     Cost = ScalarCost;
5684   }
5685 
5686   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5687              << "LV: Vectorization seems to be not beneficial, "
5688              << "but was forced by a user.\n");
5689   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5690   VectorizationFactor Factor = {ElementCount::getFixed(Width),
5691                                 (unsigned)(Width * Cost)};
5692   return Factor;
5693 }
5694 
5695 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5696     const Loop &L, ElementCount VF) const {
5697   // Cross iteration phis such as reductions need special handling and are
5698   // currently unsupported.
5699   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5700         return Legal->isFirstOrderRecurrence(&Phi) ||
5701                Legal->isReductionVariable(&Phi);
5702       }))
5703     return false;
5704 
5705   // Phis with uses outside of the loop require special handling and are
5706   // currently unsupported.
5707   for (auto &Entry : Legal->getInductionVars()) {
5708     // Look for uses of the value of the induction at the last iteration.
5709     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5710     for (User *U : PostInc->users())
5711       if (!L.contains(cast<Instruction>(U)))
5712         return false;
    // Look for uses of the penultimate value of the induction.
5714     for (User *U : Entry.first->users())
5715       if (!L.contains(cast<Instruction>(U)))
5716         return false;
5717   }
5718 
5719   // Induction variables that are widened require special handling that is
5720   // currently not supported.
5721   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5722         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5723                  this->isProfitableToScalarize(Entry.first, VF));
5724       }))
5725     return false;
5726 
5727   return true;
5728 }
5729 
5730 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5731     const ElementCount VF) const {
5732   // FIXME: We need a much better cost-model to take different parameters such
5733   // as register pressure, code size increase and cost of extra branches into
5734   // account. For now we apply a very crude heuristic and only consider loops
5735   // with vectorization factors larger than a certain value.
5736   // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
5738   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5739     return false;
5740   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5741     return true;
5742   return false;
5743 }
5744 
5745 VectorizationFactor
5746 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5747     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5748   VectorizationFactor Result = VectorizationFactor::Disabled();
5749   if (!EnableEpilogueVectorization) {
5750     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5751     return Result;
5752   }
5753 
5754   if (!isScalarEpilogueAllowed()) {
5755     LLVM_DEBUG(
5756         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5757                   "allowed.\n";);
5758     return Result;
5759   }
5760 
5761   // Not really a cost consideration, but check for unsupported cases here to
5762   // simplify the logic.
5763   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5764     LLVM_DEBUG(
5765         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5766                   "not a supported candidate.\n";);
5767     return Result;
5768   }
5769 
5770   if (EpilogueVectorizationForceVF > 1) {
5771     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5772     if (LVP.hasPlanWithVFs(
5773             {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
5774       return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
5775     else {
5776       LLVM_DEBUG(
5777           dbgs()
5778               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5779       return Result;
5780     }
5781   }
5782 
5783   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5784       TheLoop->getHeader()->getParent()->hasMinSize()) {
5785     LLVM_DEBUG(
5786         dbgs()
5787             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5788     return Result;
5789   }
5790 
5791   if (!isEpilogueVectorizationProfitable(MainLoopVF))
5792     return Result;
5793 
5794   for (auto &NextVF : ProfitableVFs)
5795     if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
5796         (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
5797         LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
5798       Result = NextVF;
5799 
5800   if (Result != VectorizationFactor::Disabled())
5801     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5802                       << Result.Width.getFixedValue() << "\n";);
5803   return Result;
5804 }
5805 
5806 std::pair<unsigned, unsigned>
5807 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5808   unsigned MinWidth = -1U;
5809   unsigned MaxWidth = 8;
5810   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5811 
5812   // For each block.
5813   for (BasicBlock *BB : TheLoop->blocks()) {
5814     // For each instruction in the loop.
5815     for (Instruction &I : BB->instructionsWithoutDebug()) {
5816       Type *T = I.getType();
5817 
5818       // Skip ignored values.
5819       if (ValuesToIgnore.count(&I))
5820         continue;
5821 
5822       // Only examine Loads, Stores and PHINodes.
5823       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5824         continue;
5825 
5826       // Examine PHI nodes that are reduction variables. Update the type to
5827       // account for the recurrence type.
5828       if (auto *PN = dyn_cast<PHINode>(&I)) {
5829         if (!Legal->isReductionVariable(PN))
5830           continue;
5831         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5832         T = RdxDesc.getRecurrenceType();
5833       }
5834 
5835       // Examine the stored values.
5836       if (auto *ST = dyn_cast<StoreInst>(&I))
5837         T = ST->getValueOperand()->getType();
5838 
5839       // Ignore loaded pointer types and stored pointer types that are not
5840       // vectorizable.
5841       //
5842       // FIXME: The check here attempts to predict whether a load or store will
5843       //        be vectorized. We only know this for certain after a VF has
5844       //        been selected. Here, we assume that if an access can be
5845       //        vectorized, it will be. We should also look at extending this
5846       //        optimization to non-pointer types.
5847       //
5848       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5849           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5850         continue;
5851 
5852       MinWidth = std::min(MinWidth,
5853                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5854       MaxWidth = std::max(MaxWidth,
5855                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5856     }
5857   }
5858 
5859   return {MinWidth, MaxWidth};
5860 }
5861 
5862 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5863                                                            unsigned LoopCost) {
5864   // -- The interleave heuristics --
5865   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5866   // There are many micro-architectural considerations that we can't predict
5867   // at this level. For example, frontend pressure (on decode or fetch) due to
5868   // code size, or the number and capabilities of the execution ports.
5869   //
5870   // We use the following heuristics to select the interleave count:
5871   // 1. If the code has reductions, then we interleave to break the cross
5872   // iteration dependency.
5873   // 2. If the loop is really small, then we interleave to reduce the loop
5874   // overhead.
5875   // 3. We don't interleave if we think that we will spill registers to memory
5876   // due to the increased register pressure.
5877 
5878   if (!isScalarEpilogueAllowed())
5879     return 1;
5880 
  // Do not interleave if there is a finite maximum safe dependence distance;
  // it has already been used to limit the vectorization factor.
5882   if (Legal->getMaxSafeDepDistBytes() != -1U)
5883     return 1;
5884 
5885   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5886   const bool HasReductions = !Legal->getReductionVars().empty();
5887   // Do not interleave loops with a relatively small known or estimated trip
5888   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5890   // because with the above conditions interleaving can expose ILP and break
5891   // cross iteration dependences for reductions.
5892   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5893       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5894     return 1;
5895 
5896   RegisterUsage R = calculateRegisterUsage({VF})[0];
5897   // We divide by these constants so assume that we have at least one
5898   // instruction that uses at least one register.
5899   for (auto& pair : R.MaxLocalUsers) {
5900     pair.second = std::max(pair.second, 1U);
5901   }
5902 
5903   // We calculate the interleave count using the following formula.
5904   // Subtract the number of loop invariants from the number of available
5905   // registers. These registers are used by all of the interleaved instances.
5906   // Next, divide the remaining registers by the number of registers that is
5907   // required by the loop, in order to estimate how many parallel instances
5908   // fit without causing spills. All of this is rounded down if necessary to be
5909   // a power of two. We want power of two interleave count to simplify any
5910   // addressing operations or alignment considerations.
5911   // We also want power of two interleave counts to ensure that the induction
5912   // variable of the vector loop wraps to zero, when tail is folded by masking;
5913   // this currently happens when OptForSize, in which case IC is set to 1 above.
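  // For example (illustrative numbers, ignoring the induction variable
  // heuristic below): 32 registers in a class, 2 of them loop invariant and
  // at most 6 live in the loop give IC = PowerOf2Floor((32 - 2) / 6) = 4.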
5914   unsigned IC = UINT_MAX;
5915 
5916   for (auto& pair : R.MaxLocalUsers) {
5917     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5918     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5919                       << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
5921     if (VF.isScalar()) {
5922       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5923         TargetNumRegisters = ForceTargetNumScalarRegs;
5924     } else {
5925       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5926         TargetNumRegisters = ForceTargetNumVectorRegs;
5927     }
5928     unsigned MaxLocalUsers = pair.second;
5929     unsigned LoopInvariantRegs = 0;
5930     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5931       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5932 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5934     // Don't count the induction variable as interleaved.
5935     if (EnableIndVarRegisterHeur) {
5936       TmpIC =
5937           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5938                         std::max(1U, (MaxLocalUsers - 1)));
5939     }
5940 
5941     IC = std::min(IC, TmpIC);
5942   }
5943 
5944   // Clamp the interleave ranges to reasonable counts.
5945   assert(!VF.isScalable() && "scalable vectors not yet supported.");
5946   unsigned MaxInterleaveCount =
5947       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5948 
5949   // Check if the user has overridden the max.
5950   if (VF.isScalar()) {
5951     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5952       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5953   } else {
5954     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5955       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5956   }
5957 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to be at most the trip count divided by VF, provided it
  // is at least 1.
5961   if (BestKnownTC) {
5962     MaxInterleaveCount =
5963         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5964     // Make sure MaxInterleaveCount is greater than 0.
5965     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5966   }
5967 
5968   assert(MaxInterleaveCount > 0 &&
5969          "Maximum interleave count must be greater than 0");
5970 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5973   if (IC > MaxInterleaveCount)
5974     IC = MaxInterleaveCount;
5975   else
5976     // Make sure IC is greater than 0.
5977     IC = std::max(1u, IC);
5978 
5979   assert(IC > 0 && "Interleave count must be greater than 0.");
5980 
5981   // If we did not calculate the cost for VF (because the user selected the VF)
5982   // then we calculate the cost of VF here.
5983   if (LoopCost == 0)
5984     LoopCost = expectedCost(VF).first;
5985 
5986   assert(LoopCost && "Non-zero loop cost expected");
5987 
5988   // Interleave if we vectorized this loop and there is a reduction that could
5989   // benefit from interleaving.
5990   if (VF.isVector() && HasReductions) {
5991     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5992     return IC;
5993   }
5994 
5995   // Note that if we've already vectorized the loop we will have done the
5996   // runtime check and so interleaving won't require further checks.
5997   bool InterleavingRequiresRuntimePointerCheck =
5998       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5999 
6000   // We want to interleave small loops in order to reduce the loop overhead and
6001   // potentially expose ILP opportunities.
6002   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6003                     << "LV: IC is " << IC << '\n'
6004                     << "LV: VF is " << VF.getKnownMinValue() << '\n');
6005   const bool AggressivelyInterleaveReductions =
6006       TTI.enableAggressiveInterleaving(HasReductions);
6007   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // Assume the per-iteration loop overhead has a cost of 1, and use the cost
    // model's estimate of the loop body to interleave until that overhead is
    // roughly 5% of the total loop cost.
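    // E.g. (illustrative), if SmallLoopCost were 20 and LoopCost were 3, this
    // would give SmallIC = min(IC, PowerOf2Floor(20 / 3)) = min(IC, 4).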
6011     unsigned SmallIC =
6012         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6013 
6014     // Interleave until store/load ports (estimated by max interleave count) are
6015     // saturated.
6016     unsigned NumStores = Legal->getNumStores();
6017     unsigned NumLoads = Legal->getNumLoads();
6018     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6019     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
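    // E.g. (illustrative), IC = 8 with 2 stores and 1 load gives
    // StoresIC = 4 and LoadsIC = 8.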
6020 
    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit it, by default, to 2 so
    // that the critical path only gets increased by one reduction operation.
6025     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6026       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6027       SmallIC = std::min(SmallIC, F);
6028       StoresIC = std::min(StoresIC, F);
6029       LoadsIC = std::min(LoadsIC, F);
6030     }
6031 
6032     if (EnableLoadStoreRuntimeInterleave &&
6033         std::max(StoresIC, LoadsIC) > SmallIC) {
6034       LLVM_DEBUG(
6035           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6036       return std::max(StoresIC, LoadsIC);
6037     }
6038 
6039     // If there are scalar reductions and TTI has enabled aggressive
6040     // interleaving for reductions, we will interleave to expose ILP.
6041     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6042         AggressivelyInterleaveReductions) {
6043       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave by no less than SmallIC, but not as aggressively as the
      // normal IC, to handle the rare situation where resources are too
      // limited.
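      // E.g. (illustrative), IC = 8 and SmallIC = 2 yield max(8 / 2, 2) = 4.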
6046       return std::max(IC / 2, SmallIC);
6047     } else {
6048       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6049       return SmallIC;
6050     }
6051   }
6052 
6053   // Interleave if this is a large loop (small loops are already dealt with by
6054   // this point) that could benefit from interleaving.
6055   if (AggressivelyInterleaveReductions) {
6056     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6057     return IC;
6058   }
6059 
6060   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6061   return 1;
6062 }
6063 
6064 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6065 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and assign
  // a number to each instruction. We use RPO to ensure that defs are met
  // before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi-map that
  // holds the list of intervals that *end* at a specific location. This
  // multi-map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it
  // in a set. If we find this value in the multi-map then we remove it from
  // the set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but
  // are used inside the loop. We need this number separately from the
  // max-interval usage number because, when we unroll, loop-invariant values
  // do not consume more registers.
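  //
  // As a purely illustrative example (hypothetical IR), given a straight-line
  // body visited in RPO:
  //   %a = load i32, i32* %p      ; last used by %c
  //   %b = add i32 %a, 1          ; last used by %c
  //   %c = mul i32 %a, %b         ; last used by the store
  //   store i32 %c, i32* %q
  // while processing %c both %a and %b are still open intervals, so the
  // maximum usage for this register class is 2. Values defined outside the
  // loop (e.g. %p and %q, if they are instructions) are tracked separately as
  // loop invariants.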
6083   LoopBlocksDFS DFS(TheLoop);
6084   DFS.perform(LI);
6085 
6086   RegisterUsage RU;
6087 
6088   // Each 'key' in the map opens a new interval. The values
6089   // of the map are the index of the 'last seen' usage of the
6090   // instruction that is the key.
6091   using IntervalMap = DenseMap<Instruction *, unsigned>;
6092 
6093   // Maps instruction to its index.
6094   SmallVector<Instruction *, 64> IdxToInstr;
6095   // Marks the end of each interval.
6096   IntervalMap EndPoint;
  // Saves the set of instructions that have in-loop users.
6098   SmallPtrSet<Instruction *, 8> Ends;
6099   // Saves the list of values that are used in the loop but are
6100   // defined outside the loop, such as arguments and constants.
6101   SmallPtrSet<Value *, 8> LoopInvariants;
6102 
6103   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6104     for (Instruction &I : BB->instructionsWithoutDebug()) {
6105       IdxToInstr.push_back(&I);
6106 
6107       // Save the end location of each USE.
6108       for (Value *U : I.operands()) {
6109         auto *Instr = dyn_cast<Instruction>(U);
6110 
6111         // Ignore non-instruction values such as arguments, constants, etc.
6112         if (!Instr)
6113           continue;
6114 
6115         // If this instruction is outside the loop then record it and continue.
6116         if (!TheLoop->contains(Instr)) {
6117           LoopInvariants.insert(Instr);
6118           continue;
6119         }
6120 
6121         // Overwrite previous end points.
6122         EndPoint[Instr] = IdxToInstr.size();
6123         Ends.insert(Instr);
6124       }
6125     }
6126   }
6127 
6128   // Saves the list of intervals that end with the index in 'key'.
6129   using InstrList = SmallVector<Instruction *, 2>;
6130   DenseMap<unsigned, InstrList> TransposeEnds;
6131 
6132   // Transpose the EndPoints to a list of values that end at each index.
6133   for (auto &Interval : EndPoint)
6134     TransposeEnds[Interval.second].push_back(Interval.first);
6135 
6136   SmallPtrSet<Instruction *, 8> OpenIntervals;
6137   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6138   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6139 
6140   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6141 
6142   // A lambda that gets the register usage for the given type and VF.
6143   const auto &TTICapture = TTI;
6144   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
6145     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6146       return 0U;
6147     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6148   };
6149 
6150   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6151     Instruction *I = IdxToInstr[i];
6152 
6153     // Remove all of the instructions that end at this location.
6154     InstrList &List = TransposeEnds[i];
6155     for (Instruction *ToRemove : List)
6156       OpenIntervals.erase(ToRemove);
6157 
6158     // Ignore instructions that are never used within the loop.
6159     if (!Ends.count(I))
6160       continue;
6161 
6162     // Skip ignored values.
6163     if (ValuesToIgnore.count(I))
6164       continue;
6165 
6166     // For each VF find the maximum usage of registers.
6167     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6168       // Count the number of live intervals.
6169       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6170 
      if (VFs[j].isScalar()) {
        for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
          RegUsage[ClassID] += 1;
        }
      } else {
        collectUniformsAndScalars(VFs[j]);
        for (auto Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }
6200 
      for (auto &pair : RegUsage) {
        unsigned &Entry = MaxUsages[j][pair.first];
        Entry = std::max(Entry, pair.second);
      }
6207     }
6208 
6209     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6210                       << OpenIntervals.size() << '\n');
6211 
6212     // Add the current instruction to the list of open intervals.
6213     OpenIntervals.insert(I);
6214   }
6215 
6216   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6217     SmallMapVector<unsigned, unsigned, 4> Invariant;
6218 
6219     for (auto Inst : LoopInvariants) {
6220       unsigned Usage =
6221           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6222       unsigned ClassID =
6223           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      Invariant[ClassID] += Usage;
6228     }
6229 
6230     LLVM_DEBUG({
6231       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
             << " item(s)\n";
6234       for (const auto &pair : MaxUsages[i]) {
6235         dbgs() << "LV(REG): RegisterClass: "
6236                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6237                << " registers\n";
6238       }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item(s)\n";
6241       for (const auto &pair : Invariant) {
6242         dbgs() << "LV(REG): RegisterClass: "
6243                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6244                << " registers\n";
6245       }
6246     });
6247 
6248     RU.LoopInvariantRegs = Invariant;
6249     RU.MaxLocalUsers = MaxUsages[i];
6250     RUs[i] = RU;
6251   }
6252 
6253   return RUs;
6254 }
6255 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
  // TODO: The cost model for emulated masked load/store is completely broken.
  // This hack guides the cost model to use an artificially high enough value
  // to practically disable vectorization with such operations, except where
  // the previously deployed legality hack allowed using very low cost values.
  // This is to avoid regressions coming simply from moving the "masked
  // load/store" check from legality to the cost model.
  // Masked load/gather emulation was previously never allowed.
  // Only a limited number of masked store/scatter emulations were allowed.
6265   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
6266   return isa<LoadInst>(I) ||
6267          (isa<StoreInst>(I) &&
6268           NumPredStores > NumberOfStoresToPredicate);
6269 }
6270 
6271 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6272   // If we aren't vectorizing the loop, or if we've already collected the
6273   // instructions to scalarize, there's nothing to do. Collection may already
6274   // have occurred if we have a user-selected VF and are now computing the
6275   // expected cost for interleaving.
6276   if (VF.isScalar() || VF.isZero() ||
6277       InstsToScalarize.find(VF) != InstsToScalarize.end())
6278     return;
6279 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6281   // not profitable to scalarize any instructions, the presence of VF in the
6282   // map will indicate that we've analyzed it already.
6283   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6284 
  // Find all the instructions that are scalar with predication in the loop
  // and determine if it would be better not to if-convert the blocks they are
  // in. If so, we also record the instructions to scalarize.
6288   for (BasicBlock *BB : TheLoop->blocks()) {
6289     if (!blockNeedsPredication(BB))
6290       continue;
6291     for (Instruction &I : *BB)
6292       if (isScalarWithPredication(&I)) {
6293         ScalarCostsTy ScalarCosts;
6294         // Do not apply discount logic if hacked cost is needed
6295         // for emulated masked memrefs.
6296         if (!useEmulatedMaskMemRefHack(&I) &&
6297             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6298           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6299         // Remember that BB will remain after vectorization.
6300         PredicatedBBsAfterVectorization.insert(BB);
6301       }
6302   }
6303 }
6304 
6305 int LoopVectorizationCostModel::computePredInstDiscount(
6306     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
6307     ElementCount VF) {
6308   assert(!isUniformAfterVectorization(PredInst, VF) &&
6309          "Instruction marked uniform-after-vectorization will be predicated");
6310 
6311   // Initialize the discount to zero, meaning that the scalar version and the
6312   // vector version cost the same.
6313   int Discount = 0;
6314 
6315   // Holds instructions to analyze. The instructions we visit are mapped in
6316   // ScalarCosts. Those instructions are the ones that would be scalarized if
6317   // we find that the scalar version costs less.
6318   SmallVector<Instruction *, 8> Worklist;
6319 
6320   // Returns true if the given instruction can be scalarized.
6321   auto canBeScalarized = [&](Instruction *I) -> bool {
6322     // We only attempt to scalarize instructions forming a single-use chain
6323     // from the original predicated block that would otherwise be vectorized.
6324     // Although not strictly necessary, we give up on instructions we know will
6325     // already be scalar to avoid traversing chains that are unlikely to be
6326     // beneficial.
6327     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6328         isScalarAfterVectorization(I, VF))
6329       return false;
6330 
6331     // If the instruction is scalar with predication, it will be analyzed
6332     // separately. We ignore it within the context of PredInst.
6333     if (isScalarWithPredication(I))
6334       return false;
6335 
6336     // If any of the instruction's operands are uniform after vectorization,
6337     // the instruction cannot be scalarized. This prevents, for example, a
6338     // masked load from being scalarized.
6339     //
6340     // We assume we will only emit a value for lane zero of an instruction
6341     // marked uniform after vectorization, rather than VF identical values.
6342     // Thus, if we scalarize an instruction that uses a uniform, we would
6343     // create uses of values corresponding to the lanes we aren't emitting code
6344     // for. This behavior can be changed by allowing getScalarValue to clone
6345     // the lane zero values for uniforms rather than asserting.
6346     for (Use &U : I->operands())
6347       if (auto *J = dyn_cast<Instruction>(U.get()))
6348         if (isUniformAfterVectorization(J, VF))
6349           return false;
6350 
6351     // Otherwise, we can scalarize the instruction.
6352     return true;
6353   };
6354 
6355   // Compute the expected cost discount from scalarizing the entire expression
6356   // feeding the predicated instruction. We currently only consider expressions
6357   // that are single-use instruction chains.
6358   Worklist.push_back(PredInst);
6359   while (!Worklist.empty()) {
6360     Instruction *I = Worklist.pop_back_val();
6361 
6362     // If we've already analyzed the instruction, there's nothing to do.
6363     if (ScalarCosts.find(I) != ScalarCosts.end())
6364       continue;
6365 
6366     // Compute the cost of the vector instruction. Note that this cost already
6367     // includes the scalarization overhead of the predicated instruction.
6368     unsigned VectorCost = getInstructionCost(I, VF).first;
6369 
6370     // Compute the cost of the scalarized instruction. This cost is the cost of
6371     // the instruction as if it wasn't if-converted and instead remained in the
6372     // predicated block. We will scale this cost by block probability after
6373     // computing the scalarization overhead.
6374     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6375     unsigned ScalarCost =
6376         VF.getKnownMinValue() *
6377         getInstructionCost(I, ElementCount::getFixed(1)).first;
6378 
6379     // Compute the scalarization overhead of needed insertelement instructions
6380     // and phi nodes.
6381     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6382       ScalarCost += TTI.getScalarizationOverhead(
6383           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6384           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6385       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6386       ScalarCost +=
6387           VF.getKnownMinValue() *
6388           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6389     }
6390 
6391     // Compute the scalarization overhead of needed extractelement
6392     // instructions. For each of the instruction's operands, if the operand can
6393     // be scalarized, add it to the worklist; otherwise, account for the
6394     // overhead.
6395     for (Use &U : I->operands())
6396       if (auto *J = dyn_cast<Instruction>(U.get())) {
6397         assert(VectorType::isValidElementType(J->getType()) &&
6398                "Instruction has non-scalar type");
6399         if (canBeScalarized(J))
6400           Worklist.push_back(J);
6401         else if (needsExtract(J, VF)) {
6402           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6403           ScalarCost += TTI.getScalarizationOverhead(
6404               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6405               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6406         }
6407       }
6408 
6409     // Scale the total scalar cost by block probability.
6410     ScalarCost /= getReciprocalPredBlockProb();
6411 
    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs at least as much as the scalar version, so
    // scalarizing would be beneficial.
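    // For example (hypothetical costs, and assuming the usual reciprocal
    // predicated-block probability of 2): VectorCost = 10 and an unscaled
    // ScalarCost of 16 give a scaled ScalarCost of 8 and a per-instruction
    // discount contribution of 10 - 8 = 2.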
6414     Discount += VectorCost - ScalarCost;
6415     ScalarCosts[I] = ScalarCost;
6416   }
6417 
6418   return Discount;
6419 }
6420 
6421 LoopVectorizationCostModel::VectorizationCostTy
6422 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6423   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6424   VectorizationCostTy Cost;
6425 
6426   // For each block.
6427   for (BasicBlock *BB : TheLoop->blocks()) {
6428     VectorizationCostTy BlockCost;
6429 
6430     // For each instruction in the old loop.
6431     for (Instruction &I : BB->instructionsWithoutDebug()) {
6432       // Skip ignored values.
6433       if (ValuesToIgnore.count(&I) ||
6434           (VF.isVector() && VecValuesToIgnore.count(&I)))
6435         continue;
6436 
6437       VectorizationCostTy C = getInstructionCost(&I, VF);
6438 
6439       // Check if we should override the cost.
6440       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6441         C.first = ForceTargetInstructionCost;
6442 
6443       BlockCost.first += C.first;
6444       BlockCost.second |= C.second;
6445       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6446                         << " for VF " << VF << " For instruction: " << I
6447                         << '\n');
6448     }
6449 
6450     // If we are vectorizing a predicated block, it will have been
6451     // if-converted. This means that the block's instructions (aside from
6452     // stores and instructions that may divide by zero) will now be
6453     // unconditionally executed. For the scalar case, we may not always execute
6454     // the predicated block. Thus, scale the block's cost by the probability of
6455     // executing it.
6456     if (VF.isScalar() && blockNeedsPredication(BB))
6457       BlockCost.first /= getReciprocalPredBlockProb();
6458 
6459     Cost.first += BlockCost.first;
6460     Cost.second |= BlockCost.second;
6461   }
6462 
6463   return Cost;
6464 }
6465 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
///
/// This SCEV can be sent to the target in order to estimate the address
/// calculation cost.
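///
/// For example (illustrative IR only), a pointer such as
///   %p = getelementptr inbounds i32, i32* %base, i64 %iv
/// with a loop-invariant %base and an induction variable %iv qualifies,
/// whereas an access whose index is some other loop-varying value does not.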
6471 static const SCEV *getAddressAccessSCEV(
6472               Value *Ptr,
6473               LoopVectorizationLegality *Legal,
6474               PredicatedScalarEvolution &PSE,
6475               const Loop *TheLoop) {
6476 
6477   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6478   if (!Gep)
6479     return nullptr;
6480 
6481   // We are looking for a gep with all loop invariant indices except for one
6482   // which should be an induction variable.
6483   auto SE = PSE.getSE();
6484   unsigned NumOperands = Gep->getNumOperands();
6485   for (unsigned i = 1; i < NumOperands; ++i) {
6486     Value *Opd = Gep->getOperand(i);
6487     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6488         !Legal->isInductionVariable(Opd))
6489       return nullptr;
6490   }
6491 
  // Now we know we have a GEP of the form ptr, %inv, %ind, %inv. Return the
  // SCEV of the pointer.
6493   return PSE.getSCEV(Ptr);
6494 }
6495 
6496 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6497   return Legal->hasStride(I->getOperand(0)) ||
6498          Legal->hasStride(I->getOperand(1));
6499 }
6500 
6501 unsigned
6502 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6503                                                         ElementCount VF) {
6504   assert(VF.isVector() &&
6505          "Scalarization cost of instruction implies vectorization.");
6506   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6507   Type *ValTy = getMemInstValueType(I);
6508   auto SE = PSE.getSE();
6509 
6510   unsigned AS = getLoadStoreAddressSpace(I);
6511   Value *Ptr = getLoadStorePointerOperand(I);
6512   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6513 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6516   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6517 
6518   // Get the cost of the scalar memory instruction and address computation.
6519   unsigned Cost =
6520       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6521 
6522   // Don't pass *I here, since it is scalar but will actually be part of a
6523   // vectorized loop where the user of it is a vectorized instruction.
6524   const Align Alignment = getLoadStoreAlignment(I);
6525   Cost += VF.getKnownMinValue() *
6526           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6527                               AS, TTI::TCK_RecipThroughput);
6528 
6529   // Get the overhead of the extractelement and insertelement instructions
6530   // we might create due to scalarization.
6531   Cost += getScalarizationOverhead(I, VF);
6532 
6533   // If we have a predicated store, it may not be executed for each vector
6534   // lane. Scale the cost by the probability of executing the predicated
6535   // block.
6536   if (isPredicatedInst(I)) {
6537     Cost /= getReciprocalPredBlockProb();
6538 
6539     if (useEmulatedMaskMemRefHack(I))
6540       // Artificially setting to a high enough value to practically disable
6541       // vectorization with such operations.
6542       Cost = 3000000;
6543   }
6544 
6545   return Cost;
6546 }
6547 
6548 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6549                                                              ElementCount VF) {
6550   Type *ValTy = getMemInstValueType(I);
6551   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6552   Value *Ptr = getLoadStorePointerOperand(I);
6553   unsigned AS = getLoadStoreAddressSpace(I);
6554   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6555   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6556 
6557   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6558          "Stride should be 1 or -1 for consecutive memory access");
6559   const Align Alignment = getLoadStoreAlignment(I);
6560   unsigned Cost = 0;
6561   if (Legal->isMaskRequired(I))
6562     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6563                                       CostKind);
6564   else
6565     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6566                                 CostKind, I);
6567 
6568   bool Reverse = ConsecutiveStride < 0;
6569   if (Reverse)
6570     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6571   return Cost;
6572 }
6573 
6574 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6575                                                          ElementCount VF) {
6576   assert(Legal->isUniformMemOp(*I));
6577 
6578   Type *ValTy = getMemInstValueType(I);
6579   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6580   const Align Alignment = getLoadStoreAlignment(I);
6581   unsigned AS = getLoadStoreAddressSpace(I);
6582   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6583   if (isa<LoadInst>(I)) {
6584     return TTI.getAddressComputationCost(ValTy) +
6585            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6586                                CostKind) +
6587            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6588   }
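  // For a store to a uniform (loop-invariant) address, only the value from
  // the last vector lane survives, so the cost below charges an address
  // computation, a single scalar store, and, if the stored value is
  // loop-varying, one extract of lane VF-1.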
6589   StoreInst *SI = cast<StoreInst>(I);
6590 
6591   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6592   return TTI.getAddressComputationCost(ValTy) +
6593          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6594                              CostKind) +
6595          (isLoopInvariantStoreValue
6596               ? 0
6597               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6598                                        VF.getKnownMinValue() - 1));
6599 }
6600 
6601 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6602                                                           ElementCount VF) {
6603   Type *ValTy = getMemInstValueType(I);
6604   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6605   const Align Alignment = getLoadStoreAlignment(I);
6606   const Value *Ptr = getLoadStorePointerOperand(I);
6607 
6608   return TTI.getAddressComputationCost(VectorTy) +
6609          TTI.getGatherScatterOpCost(
6610              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6611              TargetTransformInfo::TCK_RecipThroughput, I);
6612 }
6613 
6614 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6615                                                             ElementCount VF) {
6616   Type *ValTy = getMemInstValueType(I);
6617   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6618   unsigned AS = getLoadStoreAddressSpace(I);
6619 
6620   auto Group = getInterleavedAccessGroup(I);
6621   assert(Group && "Fail to get an interleaved access group.");
6622 
6623   unsigned InterleaveFactor = Group->getFactor();
6624   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6625   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6626 
6627   // Holds the indices of existing members in an interleaved load group.
6628   // An interleaved store group doesn't need this as it doesn't allow gaps.
6629   SmallVector<unsigned, 4> Indices;
6630   if (isa<LoadInst>(I)) {
6631     for (unsigned i = 0; i < InterleaveFactor; i++)
6632       if (Group->getMember(i))
6633         Indices.push_back(i);
6634   }
6635 
6636   // Calculate the cost of the whole interleaved group.
6637   bool UseMaskForGaps =
6638       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6639   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6640       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6641       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6642 
6643   if (Group->isReverse()) {
6644     // TODO: Add support for reversed masked interleaved access.
6645     assert(!Legal->isMaskRequired(I) &&
6646            "Reverse masked interleaved access not supported.");
6647     Cost += Group->getNumMembers() *
6648             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6649   }
6650   return Cost;
6651 }
6652 
6653 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6654                                                               ElementCount VF) {
6655   // Calculate scalar cost only. Vectorization cost should be ready at this
6656   // moment.
6657   if (VF.isScalar()) {
6658     Type *ValTy = getMemInstValueType(I);
6659     const Align Alignment = getLoadStoreAlignment(I);
6660     unsigned AS = getLoadStoreAddressSpace(I);
6661 
6662     return TTI.getAddressComputationCost(ValTy) +
6663            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6664                                TTI::TCK_RecipThroughput, I);
6665   }
6666   return getWideningCost(I, VF);
6667 }
6668 
6669 LoopVectorizationCostModel::VectorizationCostTy
6670 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6671                                                ElementCount VF) {
6672   assert(!VF.isScalable() &&
6673          "the cost model is not yet implemented for scalable vectorization");
6674   // If we know that this instruction will remain uniform, check the cost of
6675   // the scalar version.
6676   if (isUniformAfterVectorization(I, VF))
6677     VF = ElementCount::getFixed(1);
6678 
6679   if (VF.isVector() && isProfitableToScalarize(I, VF))
6680     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6681 
6682   // Forced scalars do not have any scalarization overhead.
6683   auto ForcedScalar = ForcedScalars.find(VF);
6684   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6685     auto InstSet = ForcedScalar->second;
6686     if (InstSet.count(I))
6687       return VectorizationCostTy(
6688           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6689            VF.getKnownMinValue()),
6690           false);
6691   }
6692 
6693   Type *VectorTy;
6694   unsigned C = getInstructionCost(I, VF, VectorTy);
6695 
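  // E.g. (illustrative), for VF = 4 a legal <4 x i32> is kept as a single
  // part (1 < 4), so the type counts as vectorized below; a type the target
  // splits into one part per lane is effectively scalarized.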
6696   bool TypeNotScalarized =
6697       VF.isVector() && VectorTy->isVectorTy() &&
6698       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6699   return VectorizationCostTy(C, TypeNotScalarized);
6700 }
6701 
6702 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6703                                                               ElementCount VF) {
6704 
6705   assert(!VF.isScalable() &&
6706          "cannot compute scalarization overhead for scalable vectorization");
6707   if (VF.isScalar())
6708     return 0;
6709 
6710   unsigned Cost = 0;
6711   Type *RetTy = ToVectorTy(I->getType(), VF);
6712   if (!RetTy->isVoidTy() &&
6713       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6714     Cost += TTI.getScalarizationOverhead(
6715         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6716         true, false);
6717 
6718   // Some targets keep addresses scalar.
6719   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6720     return Cost;
6721 
6722   // Some targets support efficient element stores.
6723   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6724     return Cost;
6725 
6726   // Collect operands to consider.
6727   CallInst *CI = dyn_cast<CallInst>(I);
6728   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6729 
6730   // Skip operands that do not require extraction/scalarization and do not incur
6731   // any overhead.
6732   return Cost + TTI.getOperandsScalarizationOverhead(
6733                     filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6734 }
6735 
6736 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6737   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6738   if (VF.isScalar())
6739     return;
6740   NumPredStores = 0;
6741   for (BasicBlock *BB : TheLoop->blocks()) {
6742     // For each instruction in the old loop.
6743     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6745       if (!Ptr)
6746         continue;
6747 
6748       // TODO: We should generate better code and update the cost model for
6749       // predicated uniform stores. Today they are treated as any other
6750       // predicated store (see added test cases in
6751       // invariant-store-vectorization.ll).
6752       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6753         NumPredStores++;
6754 
6755       if (Legal->isUniformMemOp(I)) {
6756         // TODO: Avoid replicating loads and stores instead of
6757         // relying on instcombine to remove them.
6758         // Load: Scalar load + broadcast
6759         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6760         unsigned Cost = getUniformMemOpCost(&I, VF);
6761         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6762         continue;
6763       }
6764 
6765       // We assume that widening is the best solution when possible.
6766       if (memoryInstructionCanBeWidened(&I, VF)) {
6767         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6768         int ConsecutiveStride =
6769                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6770         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6771                "Expected consecutive stride.");
6772         InstWidening Decision =
6773             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6774         setWideningDecision(&I, VF, Decision, Cost);
6775         continue;
6776       }
6777 
6778       // Choose between Interleaving, Gather/Scatter or Scalarization.
6779       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6780       unsigned NumAccesses = 1;
6781       if (isAccessInterleaved(&I)) {
6782         auto Group = getInterleavedAccessGroup(&I);
6783         assert(Group && "Fail to get an interleaved access group.");
6784 
6785         // Make one decision for the whole group.
6786         if (getWideningDecision(&I, VF) != CM_Unknown)
6787           continue;
6788 
6789         NumAccesses = Group->getNumMembers();
6790         if (interleavedAccessCanBeWidened(&I, VF))
6791           InterleaveCost = getInterleaveGroupCost(&I, VF);
6792       }
6793 
6794       unsigned GatherScatterCost =
6795           isLegalGatherOrScatter(&I)
6796               ? getGatherScatterCost(&I, VF) * NumAccesses
6797               : std::numeric_limits<unsigned>::max();
6798 
6799       unsigned ScalarizationCost =
6800           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6801 
      // Choose the best option for the current VF, record this decision, and
      // use it during vectorization.
6804       unsigned Cost;
6805       InstWidening Decision;
6806       if (InterleaveCost <= GatherScatterCost &&
6807           InterleaveCost < ScalarizationCost) {
6808         Decision = CM_Interleave;
6809         Cost = InterleaveCost;
6810       } else if (GatherScatterCost < ScalarizationCost) {
6811         Decision = CM_GatherScatter;
6812         Cost = GatherScatterCost;
6813       } else {
6814         Decision = CM_Scalarize;
6815         Cost = ScalarizationCost;
6816       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group is charged the cost, but
      // the cost will actually be assigned to one instruction.
6820       if (auto Group = getInterleavedAccessGroup(&I))
6821         setWideningDecision(Group, VF, Decision, Cost);
6822       else
6823         setWideningDecision(&I, VF, Decision, Cost);
6824     }
6825   }
6826 
  // Make sure that any load of an address and any other address computation
  // remain scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
6832   if (TTI.prefersVectorizedAddressing())
6833     return;
6834 
6835   // Start with all scalar pointer uses.
6836   SmallPtrSet<Instruction *, 8> AddrDefs;
6837   for (BasicBlock *BB : TheLoop->blocks())
6838     for (Instruction &I : *BB) {
6839       Instruction *PtrDef =
6840         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6841       if (PtrDef && TheLoop->contains(PtrDef) &&
6842           getWideningDecision(&I, VF) != CM_GatherScatter)
6843         AddrDefs.insert(PtrDef);
6844     }
6845 
6846   // Add all instructions used to generate the addresses.
6847   SmallVector<Instruction *, 4> Worklist;
6848   for (auto *I : AddrDefs)
6849     Worklist.push_back(I);
6850   while (!Worklist.empty()) {
6851     Instruction *I = Worklist.pop_back_val();
6852     for (auto &Op : I->operands())
6853       if (auto *InstOp = dyn_cast<Instruction>(Op))
6854         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6855             AddrDefs.insert(InstOp).second)
6856           Worklist.push_back(InstOp);
6857   }
6858 
6859   for (auto *I : AddrDefs) {
6860     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // whether the loaded register is involved in an address computation,
      // it is instead changed here when we know this is the case.
6865       InstWidening Decision = getWideningDecision(I, VF);
6866       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6867         // Scalarize a widened load of address.
6868         setWideningDecision(
6869             I, VF, CM_Scalarize,
6870             (VF.getKnownMinValue() *
6871              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6872       else if (auto Group = getInterleavedAccessGroup(I)) {
6873         // Scalarize an interleave group of address loads.
6874         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6875           if (Instruction *Member = Group->getMember(I))
6876             setWideningDecision(
6877                 Member, VF, CM_Scalarize,
6878                 (VF.getKnownMinValue() *
6879                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6880         }
6881       }
6882     } else
      // Make sure I gets scalarized and has a cost estimate without
      // scalarization overhead.
6885       ForcedScalars[VF].insert(I);
6886   }
6887 }
6888 
6889 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6890                                                         ElementCount VF,
6891                                                         Type *&VectorTy) {
6892   Type *RetTy = I->getType();
6893   if (canTruncateToMinimalBitwidth(I, VF))
6894     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6895   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6896   auto SE = PSE.getSE();
6897   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6898 
6899   // TODO: We need to estimate the cost of intrinsic calls.
6900   switch (I->getOpcode()) {
6901   case Instruction::GetElementPtr:
6902     // We mark this instruction as zero-cost because the cost of GEPs in
6903     // vectorized code depends on whether the corresponding memory instruction
6904     // is scalarized or not. Therefore, we handle GEPs with the memory
6905     // instruction cost.
6906     return 0;
6907   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6911     bool ScalarPredicatedBB = false;
6912     BranchInst *BI = cast<BranchInst>(I);
6913     if (VF.isVector() && BI->isConditional() &&
6914         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6915          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6916       ScalarPredicatedBB = true;
6917 
6918     if (ScalarPredicatedBB) {
6919       // Return cost for branches around scalarized and predicated blocks.
6920       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6921       auto *Vec_i1Ty =
6922           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6923       return (TTI.getScalarizationOverhead(
6924                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
6925                   false, true) +
6926               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
6927                VF.getKnownMinValue()));
6928     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6929       // The back-edge branch will remain, as will all scalar branches.
6930       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6931     else
6932       // This branch will be eliminated by if-conversion.
6933       return 0;
6934     // Note: We currently assume zero cost for an unconditional branch inside
6935     // a predicated block since it will become a fall-through, although we
6936     // may decide in the future to call TTI for all branches.
6937   }
6938   case Instruction::PHI: {
6939     auto *Phi = cast<PHINode>(I);
6940 
6941     // First-order recurrences are replaced by vector shuffles inside the loop.
6942     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6943     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
6944       return TTI.getShuffleCost(
6945           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
6946           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
6947 
6948     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6949     // converted into select instructions. We require N - 1 selects per phi
6950     // node, where N is the number of incoming values.
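    // E.g. (illustrative), a phi merging three incoming values in an
    // if-converted block is lowered to two vector selects and is therefore
    // charged twice the select cost.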
6951     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6952       return (Phi->getNumIncomingValues() - 1) *
6953              TTI.getCmpSelInstrCost(
6954                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6955                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6956                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6957 
6958     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6959   }
6960   case Instruction::UDiv:
6961   case Instruction::SDiv:
6962   case Instruction::URem:
6963   case Instruction::SRem:
6964     // If we have a predicated instruction, it may not be executed for each
6965     // vector lane. Get the scalarization cost and scale this amount by the
6966     // probability of executing the predicated block. If the instruction is not
6967     // predicated, we fall through to the next case.
6968     if (VF.isVector() && isScalarWithPredication(I)) {
6969       unsigned Cost = 0;
6970 
6971       // These instructions have a non-void type, so account for the phi nodes
6972       // that we will create. This cost is likely to be zero. The phi node
6973       // cost, if any, should be scaled by the block probability because it
6974       // models a copy at the end of each predicated block.
6975       Cost += VF.getKnownMinValue() *
6976               TTI.getCFInstrCost(Instruction::PHI, CostKind);
6977 
6978       // The cost of the non-predicated instruction.
6979       Cost += VF.getKnownMinValue() *
6980               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
6981 
6982       // The cost of insertelement and extractelement instructions needed for
6983       // scalarization.
6984       Cost += getScalarizationOverhead(I, VF);
6985 
6986       // Scale the cost by the probability of executing the predicated blocks.
6987       // This assumes the predicated block for each vector lane is equally
6988       // likely.
6989       return Cost / getReciprocalPredBlockProb();
6990     }
6991     LLVM_FALLTHROUGH;
6992   case Instruction::Add:
6993   case Instruction::FAdd:
6994   case Instruction::Sub:
6995   case Instruction::FSub:
6996   case Instruction::Mul:
6997   case Instruction::FMul:
6998   case Instruction::FDiv:
6999   case Instruction::FRem:
7000   case Instruction::Shl:
7001   case Instruction::LShr:
7002   case Instruction::AShr:
7003   case Instruction::And:
7004   case Instruction::Or:
7005   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
7007     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7008       return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
7011     Value *Op2 = I->getOperand(1);
7012     TargetTransformInfo::OperandValueProperties Op2VP;
7013     TargetTransformInfo::OperandValueKind Op2VK =
7014         TTI.getOperandInfo(Op2, Op2VP);
7015     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7016       Op2VK = TargetTransformInfo::OK_UniformValue;
7017 
7018     SmallVector<const Value *, 4> Operands(I->operand_values());
7019     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7020     return N * TTI.getArithmeticInstrCost(
7021                    I->getOpcode(), VectorTy, CostKind,
7022                    TargetTransformInfo::OK_AnyValue,
7023                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7024   }
7025   case Instruction::FNeg: {
7026     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7027     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7028     return N * TTI.getArithmeticInstrCost(
7029                    I->getOpcode(), VectorTy, CostKind,
7030                    TargetTransformInfo::OK_AnyValue,
7031                    TargetTransformInfo::OK_AnyValue,
7032                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
7033                    I->getOperand(0), I);
7034   }
7035   case Instruction::Select: {
7036     SelectInst *SI = cast<SelectInst>(I);
7037     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7038     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7039     Type *CondTy = SI->getCondition()->getType();
7040     if (!ScalarCond) {
7041       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7042       CondTy = VectorType::get(CondTy, VF);
7043     }
7044     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7045                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7046   }
7047   case Instruction::ICmp:
7048   case Instruction::FCmp: {
7049     Type *ValTy = I->getOperand(0)->getType();
7050     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7051     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7052       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7053     VectorTy = ToVectorTy(ValTy, VF);
7054     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7055                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7056   }
7057   case Instruction::Store:
7058   case Instruction::Load: {
7059     ElementCount Width = VF;
7060     if (Width.isVector()) {
7061       InstWidening Decision = getWideningDecision(I, Width);
7062       assert(Decision != CM_Unknown &&
7063              "CM decision should be taken at this point");
7064       if (Decision == CM_Scalarize)
7065         Width = ElementCount::getFixed(1);
7066     }
7067     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
7068     return getMemoryInstructionCost(I, VF);
7069   }
7070   case Instruction::ZExt:
7071   case Instruction::SExt:
7072   case Instruction::FPToUI:
7073   case Instruction::FPToSI:
7074   case Instruction::FPExt:
7075   case Instruction::PtrToInt:
7076   case Instruction::IntToPtr:
7077   case Instruction::SIToFP:
7078   case Instruction::UIToFP:
7079   case Instruction::Trunc:
7080   case Instruction::FPTrunc:
7081   case Instruction::BitCast: {
7082     // Computes the CastContextHint from a Load/Store instruction.
7083     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7084       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7085              "Expected a load or a store!");
7086 
7087       if (VF.isScalar() || !TheLoop->contains(I))
7088         return TTI::CastContextHint::Normal;
7089 
7090       switch (getWideningDecision(I, VF)) {
7091       case LoopVectorizationCostModel::CM_GatherScatter:
7092         return TTI::CastContextHint::GatherScatter;
7093       case LoopVectorizationCostModel::CM_Interleave:
7094         return TTI::CastContextHint::Interleave;
7095       case LoopVectorizationCostModel::CM_Scalarize:
7096       case LoopVectorizationCostModel::CM_Widen:
7097         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7098                                         : TTI::CastContextHint::Normal;
7099       case LoopVectorizationCostModel::CM_Widen_Reverse:
7100         return TTI::CastContextHint::Reversed;
7101       case LoopVectorizationCostModel::CM_Unknown:
7102         llvm_unreachable("Instr did not go through cost modelling?");
7103       }
7104 
7105       llvm_unreachable("Unhandled case!");
7106     };
7107 
7108     unsigned Opcode = I->getOpcode();
7109     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7110     // For Trunc, the context is the only user, which must be a StoreInst.
7111     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7112       if (I->hasOneUse())
7113         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7114           CCH = ComputeCCH(Store);
7115     }
7116     // For Z/Sext, the context is the operand, which must be a LoadInst.
7117     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7118              Opcode == Instruction::FPExt) {
7119       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7120         CCH = ComputeCCH(Load);
7121     }
7122 
7123     // We optimize the truncation of induction variables having constant
7124     // integer steps. The cost of these truncations is the same as the scalar
7125     // operation.
7126     if (isOptimizableIVTruncate(I, VF)) {
7127       auto *Trunc = cast<TruncInst>(I);
7128       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7129                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7130     }
7131 
7132     Type *SrcScalarTy = I->getOperand(0)->getType();
7133     Type *SrcVecTy =
7134         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7135     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7139       //
7140       // Calculate the modified src and dest types.
7141       Type *MinVecTy = VectorTy;
7142       if (Opcode == Instruction::Trunc) {
7143         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7144         VectorTy =
7145             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7146       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7147         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7148         VectorTy =
7149             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7150       }
7151     }
7152 
7153     assert(!VF.isScalable() && "VF is assumed to be non scalable");
7154     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7155     return N *
7156            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7157   }
7158   case Instruction::Call: {
7159     bool NeedToScalarize;
7160     CallInst *CI = cast<CallInst>(I);
7161     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7162     if (getVectorIntrinsicIDForCall(CI, TLI))
7163       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
7164     return CallCost;
7165   }
7166   case Instruction::ExtractValue:
7167     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7168   default:
7169     // The cost of executing VF copies of the scalar instruction. This opcode
7170     // is unknown. Assume that it is the same as 'mul'.
7171     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
7172                                        Instruction::Mul, VectorTy, CostKind) +
7173            getScalarizationOverhead(I, VF);
7174   } // end of switch.
7175 }
7176 
7177 char LoopVectorize::ID = 0;
7178 
7179 static const char lv_name[] = "Loop Vectorization";
7180 
7181 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7182 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7183 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7184 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7185 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7186 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7187 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7188 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7189 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7190 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7191 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7192 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7193 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7194 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7195 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7196 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7197 
7198 namespace llvm {
7199 
7200 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7201 
7202 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7203                               bool VectorizeOnlyWhenForced) {
7204   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7205 }
7206 
7207 } // end namespace llvm
7208 
7209 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7210   // Check if the pointer operand of a load or store instruction is
7211   // consecutive.
7212   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7213     return Legal->isConsecutivePtr(Ptr);
7214   return false;
7215 }
7216 
7217 void LoopVectorizationCostModel::collectValuesToIgnore() {
7218   // Ignore ephemeral values.
7219   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7220 
7221   // Ignore type-promoting instructions we identified during reduction
7222   // detection.
7223   for (auto &Reduction : Legal->getReductionVars()) {
7224     RecurrenceDescriptor &RedDes = Reduction.second;
7225     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7226     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7227   }
7228   // Ignore type-casting instructions we identified during induction
7229   // detection.
7230   for (auto &Induction : Legal->getInductionVars()) {
7231     InductionDescriptor &IndDes = Induction.second;
7232     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7233     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7234   }
7235 }
7236 
7237 void LoopVectorizationCostModel::collectInLoopReductions() {
7238   for (auto &Reduction : Legal->getReductionVars()) {
7239     PHINode *Phi = Reduction.first;
7240     RecurrenceDescriptor &RdxDesc = Reduction.second;
7241 
7242     // We don't collect reductions that are type promoted (yet).
7243     if (RdxDesc.getRecurrenceType() != Phi->getType())
7244       continue;
7245 
7246     // If the target would prefer this reduction to happen "in-loop", then we
7247     // want to record it as such.
7248     unsigned Opcode = RdxDesc.getRecurrenceBinOp();
7249     if (!PreferInLoopReductions &&
7250         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7251                                    TargetTransformInfo::ReductionFlags()))
7252       continue;
7253 
7254     // Check that we can correctly put the reductions into the loop, by
7255     // finding the chain of operations that leads from the phi to the loop
7256     // exit value.
7257     SmallVector<Instruction *, 4> ReductionOperations =
7258         RdxDesc.getReductionOpChain(Phi, TheLoop);
7259     bool InLoop = !ReductionOperations.empty();
7260     if (InLoop)
7261       InLoopReductionChains[Phi] = ReductionOperations;
7262     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7263                       << " reduction for phi: " << *Phi << "\n");
7264   }
7265 }
7266 
7267 // TODO: we could return a pair of values that specify the max VF and
7268 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7269 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7270 // doesn't have a cost model that can choose which plan to execute if
7271 // more than one is generated.
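// Determine the VF for the VPlan-native path as the number of elements of the
// widest scalar type in the loop that fit into the widest vector register.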
7272 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7273                                  LoopVectorizationCostModel &CM) {
7274   unsigned WidestType;
7275   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7276   return WidestVectorRegBits / WidestType;
7277 }
7278 
7279 VectorizationFactor
7280 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7281   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7282   ElementCount VF = UserVF;
7283   // Outer loop handling: outer loops may require CFG and instruction-level
7284   // transformations before we even evaluate whether vectorization is profitable.
7285   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7286   // the vectorization pipeline.
7287   if (!OrigLoop->isInnermost()) {
7288     // If the user doesn't provide a vectorization factor, determine a
7289     // reasonable one.
7290     if (UserVF.isZero()) {
7291       VF = ElementCount::getFixed(
7292           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7293       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7294 
7295       // Make sure we have a VF > 1 for stress testing.
7296       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7297         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7298                           << "overriding computed VF.\n");
7299         VF = ElementCount::getFixed(4);
7300       }
7301     }
7302     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7303     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7304            "VF needs to be a power of two");
7305     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7306                       << "VF " << VF << " to build VPlans.\n");
7307     buildVPlans(VF, VF);
7308 
7309     // For VPlan build stress testing, we bail out after VPlan construction.
7310     if (VPlanBuildStressTest)
7311       return VectorizationFactor::Disabled();
7312 
7313     return {VF, 0 /*Cost*/};
7314   }
7315 
7316   LLVM_DEBUG(
7317       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7318                 "VPlan-native path.\n");
7319   return VectorizationFactor::Disabled();
7320 }
7321 
7322 Optional<VectorizationFactor>
7323 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7324   assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
7325   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7326   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
7327   if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
7328     return None;
7329 
7330   // Invalidate interleave groups if all blocks of the loop will be predicated.
7331   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
7332       !useMaskedInterleavedAccesses(*TTI)) {
7333     LLVM_DEBUG(
7334         dbgs()
7335         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7336            "which requires masked-interleaved support.\n");
7337     if (CM.InterleaveInfo.invalidateGroups())
7338       // Invalidating interleave groups also requires invalidating all decisions
7339       // based on them, which includes widening decisions and uniform and scalar
7340       // values.
7341       CM.invalidateCostModelingDecisions();
7342   }
7343 
7344   ElementCount MaxVF = MaybeMaxVF.getValue();
7345   assert(MaxVF.isNonZero() && "MaxVF is zero.");
7346 
7347   if (!UserVF.isZero() && UserVF.getFixedValue() <= MaxVF.getFixedValue()) {
7348     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7349     assert(isPowerOf2_32(UserVF.getFixedValue()) &&
7350            "VF needs to be a power of two");
7351     // Collect the instructions (and their associated costs) that will be more
7352     // profitable to scalarize.
7353     CM.selectUserVectorizationFactor(UserVF);
7354     CM.collectInLoopReductions();
7355     buildVPlansWithVPRecipes(UserVF, UserVF);
7356     LLVM_DEBUG(printPlans(dbgs()));
7357     return {{UserVF, 0}};
7358   }
7359 
7360   for (ElementCount VF = ElementCount::getFixed(1);
7361        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
7362     // Collect Uniform and Scalar instructions after vectorization with VF.
7363     CM.collectUniformsAndScalars(VF);
7364 
7365     // Collect the instructions (and their associated costs) that will be more
7366     // profitable to scalarize.
7367     if (VF.isVector())
7368       CM.collectInstsToScalarize(VF);
7369   }
7370 
7371   CM.collectInLoopReductions();
7372 
7373   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
7374   LLVM_DEBUG(printPlans(dbgs()));
7375   if (MaxVF.isScalar())
7376     return VectorizationFactor::Disabled();
7377 
7378   // Select the optimal vectorization factor.
7379   return CM.selectVectorizationFactor(MaxVF);
7380 }
7381 
7382 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7383   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7384                     << '\n');
7385   BestVF = VF;
7386   BestUF = UF;
7387 
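  // Keep only the VPlan that includes the chosen VF; every other candidate
  // plan is discarded.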
7388   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7389     return !Plan->hasVF(VF);
7390   });
7391   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7392 }
7393 
7394 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7395                                            DominatorTree *DT) {
7396   // Perform the actual loop transformation.
7397 
7398   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7399   VPCallbackILV CallbackILV(ILV);
7400 
7401   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7402 
7403   VPTransformState State{*BestVF, BestUF,      LI,
7404                          DT,      ILV.Builder, ILV.VectorLoopValueMap,
7405                          &ILV,    CallbackILV};
7406   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7407   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7408   State.CanonicalIV = ILV.Induction;
7409 
7410   ILV.printDebugTracesAtStart();
7411 
7412   //===------------------------------------------------===//
7413   //
7414   // Notice: any optimization or new instruction that goes
7415   // into the code below should also be implemented in
7416   // the cost-model.
7417   //
7418   //===------------------------------------------------===//
7419 
7420   // 2. Copy and widen instructions from the old loop into the new loop.
7421   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7422   VPlans.front()->execute(&State);
7423 
7424   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7425   //    predication, updating analyses.
7426   ILV.fixVectorizedLoop();
7427 
7428   ILV.printDebugTracesAtEnd();
7429 }
7430 
7431 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7432     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7433   BasicBlock *Latch = OrigLoop->getLoopLatch();
7434 
7435   // We create new control-flow for the vectorized loop, so the original
7436   // condition will be dead after vectorization if it's only used by the
7437   // branch.
7438   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
7439   if (Cmp && Cmp->hasOneUse()) {
7440     DeadInstructions.insert(Cmp);
7441 
7442     // The operands of the icmp are often dead truncs, used by IndUpdate.
7443     for (Value *Op : Cmp->operands()) {
7444       if (isa<TruncInst>(Op) && Op->hasOneUse())
7445         DeadInstructions.insert(cast<Instruction>(Op));
7446     }
7447   }
7448 
7449   // We create new "steps" for induction variable updates to which the original
7450   // induction variables map. An original update instruction will be dead if
7451   // all its users except the induction variable are dead.
7452   for (auto &Induction : Legal->getInductionVars()) {
7453     PHINode *Ind = Induction.first;
7454     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7455 
7456     // If the tail is to be folded by masking, the primary induction variable,
7457     // if it exists, isn't dead: it will be used for masking. Don't kill it.
7458     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7459       continue;
7460 
7461     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7462           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7463         }))
7464       DeadInstructions.insert(IndUpdate);
7465 
7466     // We also record as "Dead" the type-casting instructions we identified
7467     // during induction analysis. They need no handling in the vectorized loop
7468     // because we have proven that, under a proper runtime test guarding the
7469     // vectorized loop, the value of the phi and the casted value of the phi
7470     // are the same. The last instruction in this casting chain will get its
7471     // scalar/vector/widened def from the scalar/vector/widened def of the
7472     // respective phi node. Any other casts in the induction def-use chain
7473     // have no uses outside the phi update chain, and will be ignored.
7474     InductionDescriptor &IndDes = Induction.second;
7475     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7476     DeadInstructions.insert(Casts.begin(), Casts.end());
7477   }
7478 }
7479 
7480 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7481 
7482 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7483 
7484 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7485                                         Instruction::BinaryOps BinOp) {
7486   // When unrolling and the VF is 1, we only need to add a simple scalar.
7487   Type *Ty = Val->getType();
7488   assert(!Ty->isVectorTy() && "Val must be a scalar");
7489 
7490   if (Ty->isFloatingPointTy()) {
7491     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7492 
7493     // Floating point operations had to be 'fast' to enable the unrolling.
7494     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7495     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7496   }
7497   Constant *C = ConstantInt::get(Ty, StartIdx);
7498   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7499 }
7500 
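/// Add "llvm.loop.unroll.runtime.disable" metadata to \p L, unless the loop
/// already carries "llvm.loop.unroll.disable" metadata, preserving any
/// existing loop-ID operands.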
7501 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7502   SmallVector<Metadata *, 4> MDs;
7503   // Reserve first location for self reference to the LoopID metadata node.
7504   MDs.push_back(nullptr);
7505   bool IsUnrollMetadata = false;
7506   MDNode *LoopID = L->getLoopID();
7507   if (LoopID) {
7508     // First find existing loop unrolling disable metadata.
7509     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7510       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7511       if (MD) {
7512         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7513         IsUnrollMetadata =
7514             S && S->getString().startswith("llvm.loop.unroll.disable");
7515       }
7516       MDs.push_back(LoopID->getOperand(i));
7517     }
7518   }
7519 
7520   if (!IsUnrollMetadata) {
7521     // Add runtime unroll disable metadata.
7522     LLVMContext &Context = L->getHeader()->getContext();
7523     SmallVector<Metadata *, 1> DisableOperands;
7524     DisableOperands.push_back(
7525         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7526     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7527     MDs.push_back(DisableNode);
7528     MDNode *NewLoopID = MDNode::get(Context, MDs);
7529     // Set operand 0 to refer to the loop id itself.
7530     NewLoopID->replaceOperandWith(0, NewLoopID);
7531     L->setLoopID(NewLoopID);
7532   }
7533 }
7534 
7535 //===--------------------------------------------------------------------===//
7536 // EpilogueVectorizerMainLoop
7537 //===--------------------------------------------------------------------===//
7538 
7539 /// This function is partially responsible for generating the control flow
7540 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7541 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7542   MDNode *OrigLoopID = OrigLoop->getLoopID();
7543   Loop *Lp = createVectorLoopSkeleton("");
7544 
7545   // Generate the code to check the minimum iteration count of the vector
7546   // epilogue (see below).
7547   EPI.EpilogueIterationCountCheck =
7548       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
7549   EPI.EpilogueIterationCountCheck->setName("iter.check");
7550 
7551   // Generate the code to check any assumptions that we've made for SCEV
7552   // expressions.
7553   BasicBlock *SavedPreHeader = LoopVectorPreHeader;
7554   emitSCEVChecks(Lp, LoopScalarPreHeader);
7555 
7556   // If a safety check was generated, save it.
7557   if (SavedPreHeader != LoopVectorPreHeader)
7558     EPI.SCEVSafetyCheck = SavedPreHeader;
7559 
7560   // Generate the code that checks at runtime if arrays overlap. We put the
7561   // checks into a separate block to make the more common case of few elements
7562   // faster.
7563   SavedPreHeader = LoopVectorPreHeader;
7564   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
7565 
7566   // If a safety check was generated, save/overwrite it.
7567   if (SavedPreHeader != LoopVectorPreHeader)
7568     EPI.MemSafetyCheck = SavedPreHeader;
7569 
7570   // Generate the iteration count check for the main loop, *after* the check
7571   // for the epilogue loop, so that the path-length is shorter for the case
7572   // that goes directly through the vector epilogue. The longer path length for
7573   // the main loop is compensated for by the gain from vectorizing the larger
7574   // trip count. Note: the branch will get updated later on when we vectorize
7575   // the epilogue.
7576   EPI.MainLoopIterationCountCheck =
7577       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
7578 
7579   // Generate the induction variable.
7580   OldInduction = Legal->getPrimaryInduction();
7581   Type *IdxTy = Legal->getWidestInductionType();
7582   Value *StartIdx = ConstantInt::get(IdxTy, 0);
7583   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7584   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7585   EPI.VectorTripCount = CountRoundDown;
7586   Induction =
7587       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7588                               getDebugLocFromInstOrOperands(OldInduction));
7589 
7590   // Skip induction resume value creation here; the resume values will be
7591   // created in the second pass. If we created them here, they wouldn't be
7592   // used anyway, because the VPlan in the second pass still contains the
7593   // inductions from the original loop.
7594 
7595   return completeLoopSkeleton(Lp, OrigLoopID);
7596 }
7597 
7598 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7599   LLVM_DEBUG({
7600     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7601            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7602            << ", Main Loop UF:" << EPI.MainLoopUF
7603            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7604            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7605   });
7606 }
7607 
7608 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7609   DEBUG_WITH_TYPE(VerboseDebug, {
7610     dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
7611   });
7612 }
7613 
7614 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
7615     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
7616   assert(L && "Expected valid Loop.");
7617   assert(Bypass && "Expected valid bypass basic block.");
7618   unsigned VFactor =
7619       ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
7620   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7621   Value *Count = getOrCreateTripCount(L);
7622   // Reuse the existing vector loop preheader for TC checks.
7623   // Note that a new preheader block is generated for the vector loop.
7624   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7625   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7626 
7627   // Generate code to check if the loop's trip count is less than VF * UF of the
7628   // main vector loop.
7629   auto P =
7630       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7631 
7632   Value *CheckMinIters = Builder.CreateICmp(
7633       P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
7634       "min.iters.check");
7635 
7636   if (!ForEpilogue)
7637     TCCheckBlock->setName("vector.main.loop.iter.check");
7638 
7639   // Create new preheader for vector loop.
7640   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7641                                    DT, LI, nullptr, "vector.ph");
7642 
7643   if (ForEpilogue) {
7644     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7645                                  DT->getNode(Bypass)->getIDom()) &&
7646            "TC check is expected to dominate Bypass");
7647 
7648     // Update dominator for Bypass & LoopExit.
7649     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7650     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7651 
7652     LoopBypassBlocks.push_back(TCCheckBlock);
7653 
7654     // Save the trip count so we don't have to regenerate it in the
7655     // vec.epilog.iter.check. This is safe to do because the trip count
7656     // generated here dominates the vector epilog iter check.
7657     EPI.TripCount = Count;
7658   }
7659 
7660   ReplaceInstWithInst(
7661       TCCheckBlock->getTerminator(),
7662       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7663 
7664   return TCCheckBlock;
7665 }
7666 
7667 //===--------------------------------------------------------------------===//
7668 // EpilogueVectorizerEpilogueLoop
7669 //===--------------------------------------------------------------------===//
7670 
7671 /// This function is partially responsible for generating the control flow
7672 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7673 BasicBlock *
7674 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7675   MDNode *OrigLoopID = OrigLoop->getLoopID();
7676   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
7677 
7678   // Now, compare the remaining count and if there aren't enough iterations to
7679   // execute the vectorized epilogue, skip to the scalar part.
7680   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7681   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7682   LoopVectorPreHeader =
7683       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7684                  LI, nullptr, "vec.epilog.ph");
7685   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
7686                                           VecEpilogueIterationCountCheck);
7687 
7688   // Adjust the control flow taking the state info from the main loop
7689   // vectorization into account.
7690   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7691          "expected this to be saved from the previous pass.");
7692   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7693       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7694 
7695   DT->changeImmediateDominator(LoopVectorPreHeader,
7696                                EPI.MainLoopIterationCountCheck);
7697 
7698   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7699       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7700 
7701   if (EPI.SCEVSafetyCheck)
7702     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7703         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7704   if (EPI.MemSafetyCheck)
7705     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7706         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7707 
7708   DT->changeImmediateDominator(
7709       VecEpilogueIterationCountCheck,
7710       VecEpilogueIterationCountCheck->getSinglePredecessor());
7711 
7712   DT->changeImmediateDominator(LoopScalarPreHeader,
7713                                EPI.EpilogueIterationCountCheck);
7714   DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
7715 
7716   // Keep track of bypass blocks, as they feed start values to the induction
7717   // phis in the scalar loop preheader.
7718   if (EPI.SCEVSafetyCheck)
7719     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7720   if (EPI.MemSafetyCheck)
7721     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7722   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7723 
7724   // Generate a resume induction for the vector epilogue and put it in the
7725   // vector epilogue preheader.
7726   Type *IdxTy = Legal->getWidestInductionType();
7727   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7728                                          LoopVectorPreHeader->getFirstNonPHI());
7729   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7730   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7731                            EPI.MainLoopIterationCountCheck);
7732 
7733   // Generate the induction variable.
7734   OldInduction = Legal->getPrimaryInduction();
7735   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7736   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7737   Value *StartIdx = EPResumeVal;
7738   Induction =
7739       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7740                               getDebugLocFromInstOrOperands(OldInduction));
7741 
7742   // Generate induction resume values. These variables save the new starting
7743   // indexes for the scalar loop. They are used to test if there are any tail
7744   // iterations left once the vector loop has completed.
7745   // Note that when the vectorized epilogue is skipped due to its iteration
7746   // count check, the resume value for the induction variable comes from
7747   // the trip count of the main vector loop, hence passing the AdditionalBypass
7748   // argument.
7749   createInductionResumeValues(Lp, CountRoundDown,
7750                               {VecEpilogueIterationCountCheck,
7751                                EPI.VectorTripCount} /* AdditionalBypass */);
7752 
7753   AddRuntimeUnrollDisableMetaData(Lp);
7754   return completeLoopSkeleton(Lp, OrigLoopID);
7755 }
7756 
7757 BasicBlock *
7758 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7759     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
7760 
7761   assert(EPI.TripCount &&
7762          "Expected trip count to have been safed in the first pass.");
7763   assert(
7764       (!isa<Instruction>(EPI.TripCount) ||
7765        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7766       "saved trip count does not dominate insertion point.");
7767   Value *TC = EPI.TripCount;
7768   IRBuilder<> Builder(Insert->getTerminator());
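  // Compute the number of iterations left to execute after the main vector
  // loop: the original trip count minus the main loop's vector trip count.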
7769   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7770 
7771   // Generate code to check if the loop's trip count is less than VF * UF of the
7772   // vector epilogue loop.
7773   auto P =
7774       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7775 
7776   Value *CheckMinIters = Builder.CreateICmp(
7777       P, Count,
7778       ConstantInt::get(Count->getType(),
7779                        EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
7780       "min.epilog.iters.check");
7781 
7782   ReplaceInstWithInst(
7783       Insert->getTerminator(),
7784       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7785 
7786   LoopBypassBlocks.push_back(Insert);
7787   return Insert;
7788 }
7789 
7790 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7791   LLVM_DEBUG({
7792     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7793            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7794            << ", Main Loop UF:" << EPI.MainLoopUF
7795            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7796            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7797   });
7798 }
7799 
7800 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7801   DEBUG_WITH_TYPE(VerboseDebug, {
7802     dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
7803   });
7804 }
7805 
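/// Evaluate \p Predicate on \p Range.Start and clamp \p Range.End down to the
/// first power-of-two VF, if any, for which \p Predicate gives a different
/// answer, so that every VF in the resulting range shares the same decision.
/// Returns the value of \p Predicate at \p Range.Start.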
7806 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7807     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7808   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7809   bool PredicateAtRangeStart = Predicate(Range.Start);
7810 
7811   for (ElementCount TmpVF = Range.Start * 2;
7812        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7813     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7814       Range.End = TmpVF;
7815       break;
7816     }
7817 
7818   return PredicateAtRangeStart;
7819 }
7820 
7821 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7822 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7823 /// of VF's starting at a given VF and extending it as much as possible. Each
7824 /// vectorization decision can potentially shorten this sub-range during
7825 /// buildVPlan().
7826 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7827                                            ElementCount MaxVF) {
7828   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7829   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7830     VFRange SubRange = {VF, MaxVFPlusOne};
7831     VPlans.push_back(buildVPlan(SubRange));
7832     VF = SubRange.End;
7833   }
7834 }
7835 
7836 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7837                                          VPlanPtr &Plan) {
7838   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7839 
7840   // Look for cached value.
7841   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7842   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7843   if (ECEntryIt != EdgeMaskCache.end())
7844     return ECEntryIt->second;
7845 
7846   VPValue *SrcMask = createBlockInMask(Src, Plan);
7847 
7848   // The terminator has to be a branch inst!
7849   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7850   assert(BI && "Unexpected terminator found");
7851 
7852   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7853     return EdgeMaskCache[Edge] = SrcMask;
7854 
7855   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
7856   assert(EdgeMask && "No Edge Mask found for condition");
7857 
7858   if (BI->getSuccessor(0) != Dst)
7859     EdgeMask = Builder.createNot(EdgeMask);
7860 
7861   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7862     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7863 
7864   return EdgeMaskCache[Edge] = EdgeMask;
7865 }
7866 
7867 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7868   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7869 
7870   // Look for cached value.
7871   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7872   if (BCEntryIt != BlockMaskCache.end())
7873     return BCEntryIt->second;
7874 
7875   // All-one mask is modelled as no-mask following the convention for masked
7876   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7877   VPValue *BlockMask = nullptr;
7878 
7879   if (OrigLoop->getHeader() == BB) {
7880     if (!CM.blockNeedsPredication(BB))
7881       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7882 
7883     // Create the block in mask as the first non-phi instruction in the block.
7884     VPBuilder::InsertPointGuard Guard(Builder);
7885     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
7886     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
7887 
7888     // Introduce the early-exit compare IV <= BTC to form header block mask.
7889     // This is used instead of IV < TC because TC may wrap, unlike BTC.
7890     // Start by constructing the desired canonical IV.
7891     VPValue *IV = nullptr;
7892     if (Legal->getPrimaryInduction())
7893       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
7894     else {
7895       auto IVRecipe = new VPWidenCanonicalIVRecipe();
7896       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
7897       IV = IVRecipe->getVPValue();
7898     }
7899     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7900     bool TailFolded = !CM.isScalarEpilogueAllowed();
7901 
7902     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
7903       // While ActiveLaneMask is a binary op that consumes the loop tripcount
7904       // as a second argument, we only pass the IV here and extract the
7905       // tripcount from the transform state where codegen of the VP instructions
7906       // happens.
7907       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
7908     } else {
7909       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
7910     }
7911     return BlockMaskCache[BB] = BlockMask;
7912   }
7913 
7914   // This is the block mask. We OR all incoming edges.
7915   for (auto *Predecessor : predecessors(BB)) {
7916     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7917     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7918       return BlockMaskCache[BB] = EdgeMask;
7919 
7920     if (!BlockMask) { // BlockMask has its initialized nullptr value.
7921       BlockMask = EdgeMask;
7922       continue;
7923     }
7924 
7925     BlockMask = Builder.createOr(BlockMask, EdgeMask);
7926   }
7927 
7928   return BlockMaskCache[BB] = BlockMask;
7929 }
7930 
7931 VPWidenMemoryInstructionRecipe *
7932 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7933                                   VPlanPtr &Plan) {
7934   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7935          "Must be called with either a load or store");
7936 
7937   auto willWiden = [&](ElementCount VF) -> bool {
7938     assert(!VF.isScalable() && "unexpected scalable ElementCount");
7939     if (VF.isScalar())
7940       return false;
7941     LoopVectorizationCostModel::InstWidening Decision =
7942         CM.getWideningDecision(I, VF);
7943     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7944            "CM decision should be taken at this point.");
7945     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7946       return true;
7947     if (CM.isScalarAfterVectorization(I, VF) ||
7948         CM.isProfitableToScalarize(I, VF))
7949       return false;
7950     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7951   };
7952 
7953   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7954     return nullptr;
7955 
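  // If the memory access requires a mask, use the block-in mask of its parent
  // block.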
7956   VPValue *Mask = nullptr;
7957   if (Legal->isMaskRequired(I))
7958     Mask = createBlockInMask(I->getParent(), Plan);
7959 
7960   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
7961   if (LoadInst *Load = dyn_cast<LoadInst>(I))
7962     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
7963 
7964   StoreInst *Store = cast<StoreInst>(I);
7965   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
7966   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
7967 }
7968 
7969 VPWidenIntOrFpInductionRecipe *
7970 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
7971   // Check if this is an integer or fp induction. If so, build the recipe that
7972   // produces its scalar and vector values.
7973   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
7974   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
7975       II.getKind() == InductionDescriptor::IK_FpInduction)
7976     return new VPWidenIntOrFpInductionRecipe(Phi);
7977 
7978   return nullptr;
7979 }
7980 
7981 VPWidenIntOrFpInductionRecipe *
7982 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
7983                                                 VFRange &Range) const {
7984   // Optimize the special case where the source is a constant integer
7985   // induction variable. Notice that we can only optimize the 'trunc' case
7986   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7987   // (c) other casts depend on pointer size.
7988 
7989   // Determine whether \p K is a truncation based on an induction variable that
7990   // can be optimized.
7991   auto isOptimizableIVTruncate =
7992       [&](Instruction *K) -> std::function<bool(ElementCount)> {
7993     return [=](ElementCount VF) -> bool {
7994       return CM.isOptimizableIVTruncate(K, VF);
7995     };
7996   };
7997 
7998   if (LoopVectorizationPlanner::getDecisionAndClampRange(
7999           isOptimizableIVTruncate(I), Range))
8000     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8001                                              I);
8002   return nullptr;
8003 }
8004 
8005 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
8006   // We know that all PHIs in non-header blocks are converted into selects, so
8007   // we don't have to worry about the insertion order and we can just use the
8008   // builder. At this point we generate the predication tree. There may be
8009   // duplications since this is a simple recursive scan, but future
8010   // optimizations will clean it up.
8011 
8012   SmallVector<VPValue *, 2> Operands;
8013   unsigned NumIncoming = Phi->getNumIncomingValues();
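  // Operands are laid out as pairs of (incoming value, edge mask); the mask is
  // omitted for an incoming edge whose mask is all-one.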
8014   for (unsigned In = 0; In < NumIncoming; In++) {
8015     VPValue *EdgeMask =
8016       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8017     assert((EdgeMask || NumIncoming == 1) &&
8018            "Multiple predecessors with one having a full mask");
8019     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
8020     if (EdgeMask)
8021       Operands.push_back(EdgeMask);
8022   }
8023   return new VPBlendRecipe(Phi, Operands);
8024 }
8025 
8026 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
8027                                                    VPlan &Plan) const {
8028 
8029   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8030       [this, CI](ElementCount VF) {
8031         return CM.isScalarWithPredication(CI, VF);
8032       },
8033       Range);
8034 
8035   if (IsPredicated)
8036     return nullptr;
8037 
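  // Calls to assume, lifetime markers, sideeffect and pseudoprobe intrinsics
  // are never widened as vector calls.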
8038   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8039   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8040              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8041              ID == Intrinsic::pseudoprobe))
8042     return nullptr;
8043 
8044   auto willWiden = [&](ElementCount VF) -> bool {
8045     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8046     // The following case may be scalarized depending on the VF.
8047     // Widen the call if a vector intrinsic for it is at least as cheap as the
8048     // vectorized call cost computed by the cost model, or if the cost model
8049     // does not require the call to be scalarized (NeedToScalarize).
8050     bool NeedToScalarize = false;
8051     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8052     bool UseVectorIntrinsic =
8053         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
8054     return UseVectorIntrinsic || !NeedToScalarize;
8055   };
8056 
8057   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8058     return nullptr;
8059 
8060   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
8061 }
8062 
8063 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8064   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8065          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8066   // Instruction should be widened, unless it is scalar after vectorization,
8067   // scalarization is profitable or it is predicated.
8068   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8069     return CM.isScalarAfterVectorization(I, VF) ||
8070            CM.isProfitableToScalarize(I, VF) ||
8071            CM.isScalarWithPredication(I, VF);
8072   };
8073   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8074                                                              Range);
8075 }
8076 
8077 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
8078   auto IsVectorizableOpcode = [](unsigned Opcode) {
8079     switch (Opcode) {
8080     case Instruction::Add:
8081     case Instruction::And:
8082     case Instruction::AShr:
8083     case Instruction::BitCast:
8084     case Instruction::FAdd:
8085     case Instruction::FCmp:
8086     case Instruction::FDiv:
8087     case Instruction::FMul:
8088     case Instruction::FNeg:
8089     case Instruction::FPExt:
8090     case Instruction::FPToSI:
8091     case Instruction::FPToUI:
8092     case Instruction::FPTrunc:
8093     case Instruction::FRem:
8094     case Instruction::FSub:
8095     case Instruction::ICmp:
8096     case Instruction::IntToPtr:
8097     case Instruction::LShr:
8098     case Instruction::Mul:
8099     case Instruction::Or:
8100     case Instruction::PtrToInt:
8101     case Instruction::SDiv:
8102     case Instruction::Select:
8103     case Instruction::SExt:
8104     case Instruction::Shl:
8105     case Instruction::SIToFP:
8106     case Instruction::SRem:
8107     case Instruction::Sub:
8108     case Instruction::Trunc:
8109     case Instruction::UDiv:
8110     case Instruction::UIToFP:
8111     case Instruction::URem:
8112     case Instruction::Xor:
8113     case Instruction::ZExt:
8114       return true;
8115     }
8116     return false;
8117   };
8118 
8119   if (!IsVectorizableOpcode(I->getOpcode()))
8120     return nullptr;
8121 
8122   // Success: widen this instruction.
8123   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
8124 }
8125 
8126 VPBasicBlock *VPRecipeBuilder::handleReplication(
8127     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8128     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
8129     VPlanPtr &Plan) {
8130   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8131       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8132       Range);
8133 
8134   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8135       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
8136       Range);
8137 
8138   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8139                                        IsUniform, IsPredicated);
8140   setRecipe(I, Recipe);
8141   Plan->addVPValue(I, Recipe);
8142 
8143   // Find if I uses a predicated instruction. If so, it will use its scalar
8144   // value. Avoid hoisting the insert-element which packs the scalar value into
8145   // a vector value, as that happens iff all users use the vector value.
8146   for (auto &Op : I->operands())
8147     if (auto *PredInst = dyn_cast<Instruction>(Op))
8148       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
8149         PredInst2Recipe[PredInst]->setAlsoPack(false);
8150 
8151   // Finalize the recipe for Instr, first if it is not predicated.
8152   if (!IsPredicated) {
8153     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8154     VPBB->appendRecipe(Recipe);
8155     return VPBB;
8156   }
8157   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8158   assert(VPBB->getSuccessors().empty() &&
8159          "VPBB has successors when handling predicated replication.");
8160   // Record predicated instructions for above packing optimizations.
8161   PredInst2Recipe[I] = Recipe;
8162   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8163   VPBlockUtils::insertBlockAfter(Region, VPBB);
8164   auto *RegSucc = new VPBasicBlock();
8165   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8166   return RegSucc;
8167 }
8168 
8169 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8170                                                       VPRecipeBase *PredRecipe,
8171                                                       VPlanPtr &Plan) {
8172   // Instructions marked for predication are replicated and placed under an
8173   // if-then construct to prevent side-effects.
8174 
8175   // Generate recipes to compute the block mask for this region.
8176   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8177 
8178   // Build the triangular if-then region.
8179   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8180   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8181   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8182   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8183   auto *PHIRecipe = Instr->getType()->isVoidTy()
8184                         ? nullptr
8185                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8186   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8187   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8188   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8189 
8190   // Note: first set Entry as region entry and then connect successors starting
8191   // from it in order, to propagate the "parent" of each VPBasicBlock.
8192   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8193   VPBlockUtils::connectBlocks(Pred, Exit);
8194 
8195   return Region;
8196 }
8197 
8198 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8199                                                       VFRange &Range,
8200                                                       VPlanPtr &Plan) {
8201   // First, check for specific widening recipes that deal with calls, memory
8202   // operations, inductions and Phi nodes.
8203   if (auto *CI = dyn_cast<CallInst>(Instr))
8204     return tryToWidenCall(CI, Range, *Plan);
8205 
8206   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8207     return tryToWidenMemory(Instr, Range, Plan);
8208 
8209   VPRecipeBase *Recipe;
8210   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8211     if (Phi->getParent() != OrigLoop->getHeader())
8212       return tryToBlend(Phi, Plan);
8213     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
8214       return Recipe;
8215     return new VPWidenPHIRecipe(Phi);
8216   }
8217 
8218   if (isa<TruncInst>(Instr) &&
8219       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
8220     return Recipe;
8221 
8222   if (!shouldWiden(Instr, Range))
8223     return nullptr;
8224 
8225   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8226     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
8227                                 OrigLoop);
8228 
8229   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8230     bool InvariantCond =
8231         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8232     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
8233                                    InvariantCond);
8234   }
8235 
8236   return tryToWiden(Instr, *Plan);
8237 }
8238 
8239 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8240                                                         ElementCount MaxVF) {
8241   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8242 
8243   // Collect instructions from the original loop that will become trivially dead
8244   // in the vectorized loop. We don't need to vectorize these instructions. For
8245   // example, original induction update instructions can become dead because we
8246   // separately emit induction "steps" when generating code for the new loop.
8247   // Similarly, we create a new latch condition when setting up the structure
8248   // of the new loop, so the old one can become dead.
8249   SmallPtrSet<Instruction *, 4> DeadInstructions;
8250   collectTriviallyDeadInstructions(DeadInstructions);
8251 
8252   // Add assume instructions we need to drop to DeadInstructions, to prevent
8253   // them from being added to the VPlan.
8254   // TODO: We only need to drop assumes in blocks that get flattened. If the
8255   // control flow is preserved, we should keep them.
8256   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8257   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8258 
8259   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8260   // Dead instructions do not need sinking. Remove them from SinkAfter.
8261   for (Instruction *I : DeadInstructions)
8262     SinkAfter.erase(I);
8263 
8264   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8265   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8266     VFRange SubRange = {VF, MaxVFPlusOne};
8267     VPlans.push_back(
8268         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8269     VF = SubRange.End;
8270   }
8271 }
8272 
8273 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8274     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8275     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8276 
8277   // Hold a mapping from predicated instructions to their recipes, in order to
8278   // fix their AlsoPack behavior if a user is determined to replicate and use a
8279   // scalar instead of a vector value.
8280   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
8281 
8282   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8283 
8284   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8285 
8286   // ---------------------------------------------------------------------------
8287   // Pre-construction: record ingredients whose recipes we'll need to further
8288   // process after constructing the initial VPlan.
8289   // ---------------------------------------------------------------------------
8290 
8291   // Mark instructions we'll need to sink later and their targets as
8292   // ingredients whose recipe we'll need to record.
8293   for (auto &Entry : SinkAfter) {
8294     RecipeBuilder.recordRecipeOf(Entry.first);
8295     RecipeBuilder.recordRecipeOf(Entry.second);
8296   }
8297   for (auto &Reduction : CM.getInLoopReductionChains()) {
8298     PHINode *Phi = Reduction.first;
8299     RecurrenceDescriptor::RecurrenceKind Kind =
8300         Legal->getReductionVars()[Phi].getRecurrenceKind();
8301     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8302 
8303     RecipeBuilder.recordRecipeOf(Phi);
8304     for (auto &R : ReductionOperations) {
8305       RecipeBuilder.recordRecipeOf(R);
8306       // For min/max reductions, where we have a pair of icmp/select, we also
8307       // need to record the ICmp recipe, so it can be removed later.
8308       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8309           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8310         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8311       }
8312     }
8313   }
8314 
8315   // For each interleave group which is relevant for this (possibly trimmed)
8316   // Range, add it to the set of groups to be later applied to the VPlan and add
8317   // placeholders for its members' Recipes which we'll be replacing with a
8318   // single VPInterleaveRecipe.
8319   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8320     auto applyIG = [IG, this](ElementCount VF) -> bool {
8321       return (VF.isVector() && // Query is illegal for VF == 1
8322               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8323                   LoopVectorizationCostModel::CM_Interleave);
8324     };
8325     if (!getDecisionAndClampRange(applyIG, Range))
8326       continue;
8327     InterleaveGroups.insert(IG);
8328     for (unsigned i = 0; i < IG->getFactor(); i++)
8329       if (Instruction *Member = IG->getMember(i))
8330         RecipeBuilder.recordRecipeOf(Member);
8331   }
8332 
8333   // ---------------------------------------------------------------------------
8334   // Build initial VPlan: Scan the body of the loop in a topological order to
8335   // visit each basic block after having visited its predecessor basic blocks.
8336   // ---------------------------------------------------------------------------
8337 
8338   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8339   auto Plan = std::make_unique<VPlan>();
8340   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8341   Plan->setEntry(VPBB);
8342 
8343   // Scan the body of the loop in a topological order to visit each basic block
8344   // after having visited its predecessor basic blocks.
8345   LoopBlocksDFS DFS(OrigLoop);
8346   DFS.perform(LI);
8347 
8348   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8349     // Relevant instructions from basic block BB will be grouped into VPRecipe
8350     // ingredients and fill a new VPBasicBlock.
8351     unsigned VPBBsForBB = 0;
8352     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8353     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8354     VPBB = FirstVPBBForBB;
8355     Builder.setInsertPoint(VPBB);
8356 
8357     // Introduce each ingredient into VPlan.
8358     // TODO: Model and preserve debug intrinsics in VPlan.
8359     for (Instruction &I : BB->instructionsWithoutDebug()) {
8360       Instruction *Instr = &I;
8361 
8362       // First filter out irrelevant instructions, to ensure no recipes are
8363       // built for them.
8364       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8365         continue;
8366 
8367       if (auto Recipe =
8368               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8369         // Check if the recipe can be converted to a VPValue. We need the extra
8370         // down-casting step until VPRecipeBase inherits from VPValue.
8371         VPValue *MaybeVPValue = Recipe->toVPValue();
8372         if (!Instr->getType()->isVoidTy() && MaybeVPValue)
8373           Plan->addVPValue(Instr, MaybeVPValue);
8374 
8375         RecipeBuilder.setRecipe(Instr, Recipe);
8376         VPBB->appendRecipe(Recipe);
8377         continue;
8378       }
8379 
8380       // Otherwise, if all widening options failed, the instruction is to be
8381       // replicated. This may create a successor for VPBB.
8382       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
8383           Instr, Range, VPBB, PredInst2Recipe, Plan);
8384       if (NextVPBB != VPBB) {
8385         VPBB = NextVPBB;
8386         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8387                                     : "");
8388       }
8389     }
8390   }
8391 
8392   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
8393   // may also be empty, such as the last one (VPBB), reflecting original
8394   // basic blocks with no recipes.
8395   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8396   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8397   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8398   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8399   delete PreEntry;
8400 
8401   // ---------------------------------------------------------------------------
8402   // Transform initial VPlan: Apply previously taken decisions, in order, to
8403   // bring the VPlan to its final state.
8404   // ---------------------------------------------------------------------------
8405 
8406   // Apply Sink-After legal constraints.
8407   for (auto &Entry : SinkAfter) {
8408     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8409     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8410     Sink->moveAfter(Target);
8411   }
8412 
8413   // Interleave memory: for each Interleave Group we marked earlier as relevant
8414   // for this VPlan, replace the Recipes widening its memory instructions with a
8415   // single VPInterleaveRecipe at its insertion point.
8416   for (auto IG : InterleaveGroups) {
8417     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8418         RecipeBuilder.getRecipe(IG->getInsertPos()));
8419     SmallVector<VPValue *, 4> StoredValues;
8420     for (unsigned i = 0; i < IG->getFactor(); ++i)
8421       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
8422         StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
8423 
8424     (new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8425                             Recipe->getMask()))
8426         ->insertBefore(Recipe);
8427 
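    // The group is now handled by the single VPInterleaveRecipe above; re-map
    // any uses of the members' VPValues and erase their old recipes.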
8428     for (unsigned i = 0; i < IG->getFactor(); ++i)
8429       if (Instruction *Member = IG->getMember(i)) {
8430         if (!Member->getType()->isVoidTy()) {
8431           VPValue *OriginalV = Plan->getVPValue(Member);
8432           Plan->removeVPValueFor(Member);
8433           OriginalV->replaceAllUsesWith(Plan->getOrAddVPValue(Member));
8434         }
8435         RecipeBuilder.getRecipe(Member)->eraseFromParent();
8436       }
8437   }
8438 
8439   // Adjust the recipes for any inloop reductions.
8440   if (Range.Start.isVector())
8441     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
8442 
8443   // Finally, if tail is folded by masking, introduce selects between the phi
8444   // and the live-out instruction of each reduction, at the end of the latch.
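       // For example (illustrative), a sum reduction %red = %phi + %val gets
       //   %sel = select <block-in-mask>, %red, %phi
       // at the end of the latch, so that masked-off lanes keep the phi's
       // value instead of a spurious update.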
8445   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
8446     Builder.setInsertPoint(VPBB);
8447     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
8448     for (auto &Reduction : Legal->getReductionVars()) {
8449       if (CM.isInLoopReduction(Reduction.first))
8450         continue;
8451       VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
8452       VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
8453       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
8454     }
8455   }
8456 
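       // Record the VFs this VPlan covers and give the plan a descriptive
       // name, e.g. (illustrative) "Initial VPlan for VF={4,8},UF>=1" when
       // Range spans VFs 4 and 8.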
8457   std::string PlanName;
8458   raw_string_ostream RSO(PlanName);
8459   ElementCount VF = Range.Start;
8460   Plan->addVF(VF);
8461   RSO << "Initial VPlan for VF={" << VF;
8462   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
8463     Plan->addVF(VF);
8464     RSO << "," << VF;
8465   }
8466   RSO << "},UF>=1";
8467   RSO.flush();
8468   Plan->setName(PlanName);
8469 
8470   return Plan;
8471 }
8472 
8473 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8474   // Outer loop handling: outer loops may require CFG and instruction-level
8475   // transformations before even evaluating whether vectorization is profitable.
8476   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8477   // the vectorization pipeline.
8478   assert(!OrigLoop->isInnermost());
8479   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8480 
8481   // Create new empty VPlan
8482   auto Plan = std::make_unique<VPlan>();
8483 
8484   // Build hierarchical CFG
8485   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8486   HCFGBuilder.buildHierarchicalCFG();
8487 
8488   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8489        VF *= 2)
8490     Plan->addVF(VF);
8491 
8492   if (EnableVPlanPredication) {
8493     VPlanPredicator VPP(*Plan);
8494     VPP.predicate();
8495 
8496     // Avoid running the transformation to recipes until masked code generation
8497     // in the VPlan-native path is in place.
8498     return Plan;
8499   }
8500 
8501   SmallPtrSet<Instruction *, 1> DeadInstructions;
8502   VPlanTransforms::VPInstructionsToVPRecipes(
8503       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
8504   return Plan;
8505 }
8506 
8507 // Adjust the recipes for any inloop reductions. The chain of instructions
8508 // leading from the loop exit instr to the phi needs to be converted to
8509 // reductions, with one operand being vector and the other being the scalar
8510 // reduction chain.
8511 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8512     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8513   for (auto &Reduction : CM.getInLoopReductionChains()) {
8514     PHINode *Phi = Reduction.first;
8515     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8516     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8517 
8518     // ReductionOperations are ordered top-down from the phi's use to the
8519     // LoopExitValue. We keep track of the previous item (the Chain) to tell
8520     // which of the two operands will remain scalar and which will be reduced.
8521     // For minmax the chain will be the select instructions.
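         // For example (illustrative), for an integer smin reduction
         //   %cmp = icmp slt i32 %phi, %a
         //   %min = select i1 %cmp, i32 %phi, i32 %a
         // the chain is %phi -> %min: the widened select recipe for %min is
         // replaced below by a VPReductionRecipe whose scalar chain operand is
         // %phi and whose vector operand is the widened %a, and the now-unused
         // widened %cmp recipe is erased.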
8522     Instruction *Chain = Phi;
8523     for (Instruction *R : ReductionOperations) {
8524       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8525       RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
8526 
8527       VPValue *ChainOp = Plan->getVPValue(Chain);
8528       unsigned FirstOpId;
8529       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8530           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8531         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8532                "Expected to replace a VPWidenSelectSC");
8533         FirstOpId = 1;
8534       } else {
8535         assert(isa<VPWidenRecipe>(WidenRecipe) &&
8536                "Expected to replace a VPWidenSC");
8537         FirstOpId = 0;
8538       }
8539       unsigned VecOpId =
8540           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
8541       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
8542 
8543       auto *CondOp = CM.foldTailByMasking()
8544                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
8545                          : nullptr;
8546       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
8547           &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
8548       WidenRecipe->toVPValue()->replaceAllUsesWith(RedRecipe);
8549       Plan->removeVPValueFor(R);
8550       Plan->addVPValue(R, RedRecipe);
8551       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
8552       WidenRecipe->eraseFromParent();
8553 
8554       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8555           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8556         VPRecipeBase *CompareRecipe =
8557             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
8558         assert(isa<VPWidenRecipe>(CompareRecipe) &&
8559                "Expected to replace a VPWidenSC");
8560         assert(CompareRecipe->toVPValue()->getNumUsers() == 0 &&
8561                "Expected no remaining users");
8562         CompareRecipe->eraseFromParent();
8563       }
8564       Chain = R;
8565     }
8566   }
8567 }
8568 
8569 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
8570     Value *V, unsigned Part) {
8571   return ILV.getOrCreateVectorValue(V, Part);
8572 }
8573 
8574 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
8575     Value *V, const VPIteration &Instance) {
8576   return ILV.getOrCreateScalarValue(V, Instance);
8577 }
8578 
8579 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
8580                                VPSlotTracker &SlotTracker) const {
8581   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
8582   IG->getInsertPos()->printAsOperand(O, false);
8583   O << ", ";
8584   getAddr()->printAsOperand(O, SlotTracker);
8585   VPValue *Mask = getMask();
8586   if (Mask) {
8587     O << ", ";
8588     Mask->printAsOperand(O, SlotTracker);
8589   }
8590   for (unsigned i = 0; i < IG->getFactor(); ++i)
8591     if (Instruction *I = IG->getMember(i))
8592       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
8593 }
8594 
8595 void VPWidenCallRecipe::execute(VPTransformState &State) {
8596   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
8597                                   *this, State);
8598 }
8599 
8600 void VPWidenSelectRecipe::execute(VPTransformState &State) {
8601   State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
8602                                     this, *this, InvariantCond, State);
8603 }
8604 
8605 void VPWidenRecipe::execute(VPTransformState &State) {
8606   State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
8607 }
8608 
8609 void VPWidenGEPRecipe::execute(VPTransformState &State) {
8610   State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
8611                       *this, State.UF, State.VF, IsPtrLoopInvariant,
8612                       IsIndexLoopInvariant, State);
8613 }
8614 
8615 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
8616   assert(!State.Instance && "Int or FP induction being replicated.");
8617   State.ILV->widenIntOrFpInduction(IV, Trunc);
8618 }
8619 
8620 void VPWidenPHIRecipe::execute(VPTransformState &State) {
8621   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
8622 }
8623 
8624 void VPBlendRecipe::execute(VPTransformState &State) {
8625   State.ILV->setDebugLocFromInst(State.Builder, Phi);
8626   // We know that all PHIs in non-header blocks are converted into
8627   // selects, so we don't have to worry about the insertion order and we
8628   // can just use the builder.
8629   // At this point we generate the predication tree. There may be
8630   // duplications since this is a simple recursive scan, but future
8631   // optimizations will clean it up.
8632 
8633   unsigned NumIncoming = getNumIncomingValues();
8634 
8635   // Generate a sequence of selects of the form:
8636   // SELECT(Mask3, In3,
8637   //        SELECT(Mask2, In2,
8638   //               SELECT(Mask1, In1,
8639   //                      In0)))
8640   // Note that Mask0 is never used: lanes for which no path reaches this phi,
8641   // and which are essentially undef, are taken from In0.
8642   InnerLoopVectorizer::VectorParts Entry(State.UF);
8643   for (unsigned In = 0; In < NumIncoming; ++In) {
8644     for (unsigned Part = 0; Part < State.UF; ++Part) {
8645       // We might have single edge PHIs (blocks) - use an identity
8646       // 'select' for the first PHI operand.
8647       Value *In0 = State.get(getIncomingValue(In), Part);
8648       if (In == 0)
8649         Entry[Part] = In0; // Initialize with the first incoming value.
8650       else {
8651         // Select between the current value and the previous incoming edge
8652         // based on the incoming mask.
8653         Value *Cond = State.get(getMask(In), Part);
8654         Entry[Part] =
8655             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8656       }
8657     }
8658   }
8659   for (unsigned Part = 0; Part < State.UF; ++Part)
8660     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
8661 }
8662 
8663 void VPInterleaveRecipe::execute(VPTransformState &State) {
8664   assert(!State.Instance && "Interleave group being replicated.");
8665   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getStoredValues(),
8666                                       getMask());
8667 }
8668 
8669 void VPReductionRecipe::execute(VPTransformState &State) {
8670   assert(!State.Instance && "Reduction being replicated.");
8671   for (unsigned Part = 0; Part < State.UF; ++Part) {
8672     RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc->getRecurrenceKind();
8673     Value *NewVecOp = State.get(getVecOp(), Part);
8674     if (VPValue *Cond = getCondOp()) {
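           // When tail folding, blend the masked-off lanes with the
           // reduction's identity element (e.g. 0 for an add reduction) so
           // they do not affect the reduced value.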
8675       Value *NewCond = State.get(Cond, Part);
8676       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
8677       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
8678           Kind, RdxDesc->getMinMaxRecurrenceKind(), VecTy->getElementType());
8679       Constant *IdenVec =
8680           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
8681       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
8682       NewVecOp = Select;
8683     }
8684     Value *NewRed =
8685         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
8686     Value *PrevInChain = State.get(getChainOp(), Part);
8687     Value *NextInChain;
8688     if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8689         Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8690       NextInChain =
8691           createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
8692                          NewRed, PrevInChain);
8693     } else {
8694       NextInChain = State.Builder.CreateBinOp(
8695           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
8696           PrevInChain);
8697     }
8698     State.set(this, getUnderlyingInstr(), NextInChain, Part);
8699   }
8700 }
8701 
8702 void VPReplicateRecipe::execute(VPTransformState &State) {
8703   if (State.Instance) { // Generate a single instance.
8704     State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
8705                                     *State.Instance, IsPredicated, State);
8706     // Insert scalar instance packing it into a vector.
8707     if (AlsoPack && State.VF.isVector()) {
8708       // If we're constructing lane 0, initialize to start from undef.
8709       if (State.Instance->Lane == 0) {
8710         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8711         Value *Undef = UndefValue::get(
8712             VectorType::get(getUnderlyingValue()->getType(), State.VF));
8713         State.ValueMap.setVectorValue(getUnderlyingInstr(),
8714                                       State.Instance->Part, Undef);
8715       }
8716       State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
8717                                            *State.Instance);
8718     }
8719     return;
8720   }
8721 
8722   // Generate scalar instances for all VF lanes of all UF parts, unless the
8723   // instruction is uniform, in which case generate only the first lane for each
8724   // of the UF parts.
8725   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8726   for (unsigned Part = 0; Part < State.UF; ++Part)
8727     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8728       State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
8729                                       IsPredicated, State);
8730 }
8731 
8732 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8733   assert(State.Instance && "Branch on Mask works only on single instance.");
8734 
8735   unsigned Part = State.Instance->Part;
8736   unsigned Lane = State.Instance->Lane;
8737 
8738   Value *ConditionBit = nullptr;
8739   VPValue *BlockInMask = getMask();
8740   if (BlockInMask) {
8741     ConditionBit = State.get(BlockInMask, Part);
8742     if (ConditionBit->getType()->isVectorTy())
8743       ConditionBit = State.Builder.CreateExtractElement(
8744           ConditionBit, State.Builder.getInt32(Lane));
8745   } else // Block in mask is all-one.
8746     ConditionBit = State.Builder.getTrue();
8747 
8748   // Replace the temporary unreachable terminator with a new conditional branch,
8749   // whose two destinations will be set later when they are created.
8750   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8751   assert(isa<UnreachableInst>(CurrentTerminator) &&
8752          "Expected to replace unreachable terminator with conditional branch.");
8753   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8754   CondBr->setSuccessor(0, nullptr);
8755   ReplaceInstWithInst(CurrentTerminator, CondBr);
8756 }
8757 
8758 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8759   assert(State.Instance && "Predicated instruction PHI works per instance.");
8760   Instruction *ScalarPredInst =
8761       cast<Instruction>(State.get(getOperand(0), *State.Instance));
8762   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8763   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8764   assert(PredicatingBB && "Predicated block has no single predecessor.");
8765 
8766   // By current pack/unpack logic we need to generate only a single phi node: if
8767   // a vector value for the predicated instruction exists at this point it means
8768   // the instruction has vector users only, and a phi for the vector value is
8769   // needed. In this case the recipe of the predicated instruction is marked to
8770   // also do that packing, thereby "hoisting" the insert-element sequence.
8771   // Otherwise, a phi node for the scalar value is needed.
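       // (Illustrative shape of the vector case:)
       //   %vphi = phi [ %unmodified.vec, %PredicatingBB ],
       //                [ %vec.with.inserted.elt, %PredicatedBB ]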
8772   unsigned Part = State.Instance->Part;
8773   Instruction *PredInst =
8774       cast<Instruction>(getOperand(0)->getUnderlyingValue());
8775   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8776     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8777     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8778     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8779     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8780     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8781     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8782   } else {
8783     Type *PredInstType = PredInst->getType();
8784     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8785     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8786     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8787     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8788   }
8789 }
8790 
8791 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8792   Instruction *Instr = getUnderlyingInstr();
8793   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
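       // Only loads define a value: pass this recipe as the produced VPValue
       // for loads and null for stores.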
8794   State.ILV->vectorizeMemoryInstruction(Instr, State,
8795                                         StoredValue ? nullptr : this, getAddr(),
8796                                         StoredValue, getMask());
8797 }
8798 
8799 // Determine how to lower the scalar epilogue, which depends on 1) optimising
8800 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8801 // predication, and 4) a TTI hook that analyses whether the loop is suitable
8802 // for predication.
8803 static ScalarEpilogueLowering getScalarEpilogueLowering(
8804     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8805     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8806     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8807     LoopVectorizationLegality &LVL) {
8808   // 1) OptSize takes precedence over all other options, i.e. if this is set,
8809   // don't look at hints or options, and don't request a scalar epilogue.
8810   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8811   // LoopAccessInfo (due to code dependency and not being able to reliably get
8812   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8813   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
8814   // versioning when the vectorization is forced, unlike hasOptSize. So revert
8815   // to the old way and vectorize with versioning when forced. See D81345.)
8816   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8817                                                       PGSOQueryType::IRPass) &&
8818                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8819     return CM_ScalarEpilogueNotAllowedOptSize;
8820 
8821   bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() &&
8822                               !PreferPredicateOverEpilogue;
8823 
8824   // 2) Next, if disabling predication is requested on the command line, honour
8825   // this and request a scalar epilogue.
8826   if (PredicateOptDisabled)
8827     return CM_ScalarEpilogueAllowed;
8828 
8829   // 3) and 4) check whether predication is requested on the command line or
8830   // with a loop hint, or whether the TTI hook indicates it is profitable; if
8831   // so, request predication.
8832   if (PreferPredicateOverEpilogue ||
8833       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
8834       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8835                                         LVL.getLAI()) &&
8836        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
8837     return CM_ScalarEpilogueNotNeededUsePredicate;
8838 
8839   return CM_ScalarEpilogueAllowed;
8840 }
8841 
8842 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
8843                            unsigned Part) {
8844   set(Def, V, Part);
8845   ILV->setVectorValue(IRDef, Part, V);
8846 }
8847 
8848 // Process the loop in the VPlan-native vectorization path. This path builds
8849 // VPlan upfront in the vectorization pipeline, which allows VPlan-to-VPlan
8850 // transformations to be applied from the very beginning without modifying the
8851 // input LLVM IR.
8852 static bool processLoopInVPlanNativePath(
8853     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8854     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8855     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8856     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8857     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8858 
8859   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
8860     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8861     return false;
8862   }
8863   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8864   Function *F = L->getHeader()->getParent();
8865   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8866 
8867   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8868       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8869 
8870   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8871                                 &Hints, IAI);
8872   // Use the planner for outer loop vectorization.
8873   // TODO: CM is not used at this point inside the planner. Turn CM into an
8874   // optional argument if we don't need it in the future.
8875   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8876 
8877   // Get user vectorization factor.
8878   ElementCount UserVF = Hints.getWidth();
8879   if (UserVF.isScalable()) {
8880     // TODO: Use scalable UserVF once we've added initial support for scalable
8881     // vectorization. For now we convert it to fixed width, but this will be
8882     // removed in a later patch.
8883     UserVF = ElementCount::getFixed(UserVF.getKnownMinValue());
8884   }
8885 
8886   // Plan how to best vectorize, return the best VF and its cost.
8887   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
8888 
8889   // If we are stress testing VPlan builds, do not attempt to generate vector
8890   // code. Masked vector code generation support will follow soon.
8891   // Also, do not attempt to vectorize if no vector code will be produced.
8892   if (VPlanBuildStressTest || EnableVPlanPredication ||
8893       VectorizationFactor::Disabled() == VF)
8894     return false;
8895 
8896   LVP.setBestPlan(VF.Width, 1);
8897 
8898   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
8899                          &CM, BFI, PSI);
8900   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8901                     << L->getHeader()->getParent()->getName() << "\"\n");
8902   LVP.executePlan(LB, DT);
8903 
8904   // Mark the loop as already vectorized to avoid vectorizing again.
8905   Hints.setAlreadyVectorized();
8906 
8907   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8908   return true;
8909 }
8910 
8911 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8912     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8913                                !EnableLoopInterleaving),
8914       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8915                               !EnableLoopVectorization) {}
8916 
8917 bool LoopVectorizePass::processLoop(Loop *L) {
8918   assert((EnableVPlanNativePath || L->isInnermost()) &&
8919          "VPlan-native path is not enabled. Only process inner loops.");
8920 
8921 #ifndef NDEBUG
8922   const std::string DebugLocStr = getDebugLocString(L);
8923 #endif /* NDEBUG */
8924 
8925   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
8926                     << L->getHeader()->getParent()->getName() << "\" from "
8927                     << DebugLocStr << "\n");
8928 
8929   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
8930 
8931   LLVM_DEBUG(
8932       dbgs() << "LV: Loop hints:"
8933              << " force="
8934              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8935                      ? "disabled"
8936                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8937                             ? "enabled"
8938                             : "?"))
8939              << " width=" << Hints.getWidth()
8940              << " unroll=" << Hints.getInterleave() << "\n");
8941 
8942   // Function containing loop
8943   Function *F = L->getHeader()->getParent();
8944 
8945   // Looking at the diagnostic output is the only way to determine if a loop
8946   // was vectorized (other than looking at the IR or machine code), so it
8947   // is important to generate an optimization remark for each loop. Most of
8948   // these messages are generated as OptimizationRemarkAnalysis. Remarks
8949   // generated as OptimizationRemark and OptimizationRemarkMissed are
8950   // less verbose, reporting vectorized loops and unvectorized loops that may
8951   // benefit from vectorization, respectively.
8952 
8953   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
8954     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
8955     return false;
8956   }
8957 
8958   PredicatedScalarEvolution PSE(*SE, *L);
8959 
8960   // Check if it is legal to vectorize the loop.
8961   LoopVectorizationRequirements Requirements(*ORE);
8962   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
8963                                 &Requirements, &Hints, DB, AC, BFI, PSI);
8964   if (!LVL.canVectorize(EnableVPlanNativePath)) {
8965     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
8966     Hints.emitRemarkWithHints();
8967     return false;
8968   }
8969 
8970   // Check the function attributes and profiles to find out if this function
8971   // should be optimized for size.
8972   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8973       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
8974 
8975   // Entrance to the VPlan-native vectorization path. Outer loops are processed
8976   // here. They may require CFG and instruction level transformations before
8977   // even evaluating whether vectorization is profitable. Since we cannot modify
8978   // the incoming IR, we need to build VPlan upfront in the vectorization
8979   // pipeline.
8980   if (!L->isInnermost())
8981     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
8982                                         ORE, BFI, PSI, Hints);
8983 
8984   assert(L->isInnermost() && "Inner loop expected.");
8985 
8986   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
8987   // count by optimizing for size, to minimize overheads.
8988   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
8989   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
8990     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
8991                       << "This loop is worth vectorizing only if no scalar "
8992                       << "iteration overheads are incurred.");
8993     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
8994       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
8995     else {
8996       LLVM_DEBUG(dbgs() << "\n");
8997       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
8998     }
8999   }
9000 
9001   // Check the function attributes to see if implicit floats are allowed.
9002   // FIXME: This check doesn't seem right -- what if the loop is
9003   // an integer loop and the vector instructions selected are purely integer
9004   // vector instructions?
9005   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9006     reportVectorizationFailure(
9007         "Can't vectorize when the NoImplicitFloat attribute is used",
9008         "loop not vectorized due to NoImplicitFloat attribute",
9009         "NoImplicitFloat", ORE, L);
9010     Hints.emitRemarkWithHints();
9011     return false;
9012   }
9013 
9014   // Check if the target supports potentially unsafe FP vectorization.
9015   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9016   // for the target we're vectorizing for, to make sure none of the
9017   // additional fp-math flags can help.
9018   if (Hints.isPotentiallyUnsafe() &&
9019       TTI->isFPVectorizationPotentiallyUnsafe()) {
9020     reportVectorizationFailure(
9021         "Potentially unsafe FP op prevents vectorization",
9022         "loop not vectorized due to unsafe FP support.",
9023         "UnsafeFP", ORE, L);
9024     Hints.emitRemarkWithHints();
9025     return false;
9026   }
9027 
9028   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9029   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9030 
9031   // If an override option has been passed in for interleaved accesses, use it.
9032   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9033     UseInterleaved = EnableInterleavedMemAccesses;
9034 
9035   // Analyze interleaved memory accesses.
9036   if (UseInterleaved) {
9037     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9038   }
9039 
9040   // Use the cost model.
9041   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9042                                 F, &Hints, IAI);
9043   CM.collectValuesToIgnore();
9044 
9045   // Use the planner for vectorization.
9046   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
9047 
9048   // Get user vectorization factor and interleave count.
9049   ElementCount UserVF = Hints.getWidth();
9050   if (UserVF.isScalable()) {
9051     // TODO: Use scalable UserVF once we've added initial support for scalable
9052     // vectorization. For now we convert it to fixed width, but this will be
9053     // removed in a later patch.
9054     UserVF = ElementCount::getFixed(UserVF.getKnownMinValue());
9055   }
9056 
9057   unsigned UserIC = Hints.getInterleave();
9058 
9059   // Plan how to best vectorize, return the best VF and its cost.
9060   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9061 
9062   VectorizationFactor VF = VectorizationFactor::Disabled();
9063   unsigned IC = 1;
9064 
9065   if (MaybeVF) {
9066     VF = *MaybeVF;
9067     // Select the interleave count.
9068     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9069   }
9070 
9071   // Identify the diagnostic messages that should be produced.
9072   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9073   bool VectorizeLoop = true, InterleaveLoop = true;
9074   if (Requirements.doesNotMeet(F, L, Hints)) {
9075     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
9076                          "requirements.\n");
9077     Hints.emitRemarkWithHints();
9078     return false;
9079   }
9080 
9081   if (VF.Width.isScalar()) {
9082     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9083     VecDiagMsg = std::make_pair(
9084         "VectorizationNotBeneficial",
9085         "the cost-model indicates that vectorization is not beneficial");
9086     VectorizeLoop = false;
9087   }
9088 
9089   if (!MaybeVF && UserIC > 1) {
9090     // Tell the user interleaving was avoided up-front, despite being explicitly
9091     // requested.
9092     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9093                          "interleaving should be avoided up front\n");
9094     IntDiagMsg = std::make_pair(
9095         "InterleavingAvoided",
9096         "Ignoring UserIC, because interleaving was avoided up front");
9097     InterleaveLoop = false;
9098   } else if (IC == 1 && UserIC <= 1) {
9099     // Tell the user interleaving is not beneficial.
9100     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9101     IntDiagMsg = std::make_pair(
9102         "InterleavingNotBeneficial",
9103         "the cost-model indicates that interleaving is not beneficial");
9104     InterleaveLoop = false;
9105     if (UserIC == 1) {
9106       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9107       IntDiagMsg.second +=
9108           " and is explicitly disabled or interleave count is set to 1";
9109     }
9110   } else if (IC > 1 && UserIC == 1) {
9111     // Tell the user interleaving is beneficial, but it is explicitly disabled.
9112     LLVM_DEBUG(
9113         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9114     IntDiagMsg = std::make_pair(
9115         "InterleavingBeneficialButDisabled",
9116         "the cost-model indicates that interleaving is beneficial "
9117         "but is explicitly disabled or interleave count is set to 1");
9118     InterleaveLoop = false;
9119   }
9120 
9121   // Override IC if user provided an interleave count.
9122   IC = UserIC > 0 ? UserIC : IC;
9123 
9124   // Emit diagnostic messages, if any.
9125   const char *VAPassName = Hints.vectorizeAnalysisPassName();
9126   if (!VectorizeLoop && !InterleaveLoop) {
9127     // Do not vectorize or interleave the loop.
9128     ORE->emit([&]() {
9129       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9130                                       L->getStartLoc(), L->getHeader())
9131              << VecDiagMsg.second;
9132     });
9133     ORE->emit([&]() {
9134       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9135                                       L->getStartLoc(), L->getHeader())
9136              << IntDiagMsg.second;
9137     });
9138     return false;
9139   } else if (!VectorizeLoop && InterleaveLoop) {
9140     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9141     ORE->emit([&]() {
9142       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9143                                         L->getStartLoc(), L->getHeader())
9144              << VecDiagMsg.second;
9145     });
9146   } else if (VectorizeLoop && !InterleaveLoop) {
9147     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9148                       << ") in " << DebugLocStr << '\n');
9149     ORE->emit([&]() {
9150       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9151                                         L->getStartLoc(), L->getHeader())
9152              << IntDiagMsg.second;
9153     });
9154   } else if (VectorizeLoop && InterleaveLoop) {
9155     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9156                       << ") in " << DebugLocStr << '\n');
9157     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9158   }
9159 
9160   LVP.setBestPlan(VF.Width, IC);
9161 
9162   using namespace ore;
9163   bool DisableRuntimeUnroll = false;
9164   MDNode *OrigLoopID = L->getLoopID();
9165 
9166   if (!VectorizeLoop) {
9167     assert(IC > 1 && "interleave count should not be 1 or 0");
9168     // If we decided that it is not legal to vectorize the loop, then
9169     // interleave it.
9170     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
9171                                BFI, PSI);
9172     LVP.executePlan(Unroller, DT);
9173 
9174     ORE->emit([&]() {
9175       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9176                                 L->getHeader())
9177              << "interleaved loop (interleaved count: "
9178              << NV("InterleaveCount", IC) << ")";
9179     });
9180   } else {
9181     // If we decided that it is *legal* to vectorize the loop, then do it.
9182 
9183     // Consider vectorizing the epilogue too if it's profitable.
9184     VectorizationFactor EpilogueVF =
9185       CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
9186     if (EpilogueVF.Width.isVector()) {
9187 
9188       // The first pass vectorizes the main loop and creates a scalar epilogue
9189       // to be vectorized by executing the plan (potentially with a different
9190       // factor) again shortly afterwards.
9191       EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
9192                                         EpilogueVF.Width.getKnownMinValue(), 1);
9193       EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI,
9194                                          &LVL, &CM, BFI, PSI);
9195 
9196       LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
9197       LVP.executePlan(MainILV, DT);
9198       ++LoopsVectorized;
9199 
9200       simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9201       formLCSSARecursively(*L, *DT, LI, SE);
9202 
9203       // Second pass vectorizes the epilogue and adjusts the control flow
9204       // edges from the first pass.
9205       LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
9206       EPI.MainLoopVF = EPI.EpilogueVF;
9207       EPI.MainLoopUF = EPI.EpilogueUF;
9208       EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
9209                                                ORE, EPI, &LVL, &CM, BFI, PSI);
9210       LVP.executePlan(EpilogILV, DT);
9211       ++LoopsEpilogueVectorized;
9212 
9213       if (!MainILV.areSafetyChecksAdded())
9214         DisableRuntimeUnroll = true;
9215     } else {
9216       InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
9217                              &LVL, &CM, BFI, PSI);
9218       LVP.executePlan(LB, DT);
9219       ++LoopsVectorized;
9220 
9221       // Add metadata to disable runtime unrolling of the scalar loop when there
9222       // are no runtime checks about strides and memory. A scalar loop that is
9223       // rarely used is not worth unrolling.
9224       if (!LB.areSafetyChecksAdded())
9225         DisableRuntimeUnroll = true;
9226     }
9227 
9228     // Report the vectorization decision.
9229     ORE->emit([&]() {
9230       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
9231                                 L->getHeader())
9232              << "vectorized loop (vectorization width: "
9233              << NV("VectorizationFactor", VF.Width)
9234              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
9235     });
9236   }
9237 
9238   Optional<MDNode *> RemainderLoopID =
9239       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
9240                                       LLVMLoopVectorizeFollowupEpilogue});
9241   if (RemainderLoopID.hasValue()) {
9242     L->setLoopID(RemainderLoopID.getValue());
9243   } else {
9244     if (DisableRuntimeUnroll)
9245       AddRuntimeUnrollDisableMetaData(L);
9246 
9247     // Mark the loop as already vectorized to avoid vectorizing again.
9248     Hints.setAlreadyVectorized();
9249   }
9250 
9251   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9252   return true;
9253 }
9254 
9255 LoopVectorizeResult LoopVectorizePass::runImpl(
9256     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
9257     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
9258     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
9259     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
9260     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
9261   SE = &SE_;
9262   LI = &LI_;
9263   TTI = &TTI_;
9264   DT = &DT_;
9265   BFI = &BFI_;
9266   TLI = TLI_;
9267   AA = &AA_;
9268   AC = &AC_;
9269   GetLAA = &GetLAA_;
9270   DB = &DB_;
9271   ORE = &ORE_;
9272   PSI = PSI_;
9273 
9274   // Don't attempt if
9275   // 1. the target claims to have no vector registers, and
9276   // 2. interleaving won't help ILP.
9277   //
9278   // The second condition is necessary because, even if the target has no
9279   // vector registers, loop vectorization may still enable scalar
9280   // interleaving.
9281   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
9282       TTI->getMaxInterleaveFactor(1) < 2)
9283     return LoopVectorizeResult(false, false);
9284 
9285   bool Changed = false, CFGChanged = false;
9286 
9287   // The vectorizer requires loops to be in simplified form.
9288   // Since simplification may add new inner loops, it has to run before the
9289   // legality and profitability checks. This means running the loop vectorizer
9290   // will simplify all loops, regardless of whether anything ends up being
9291   // vectorized.
9292   for (auto &L : *LI)
9293     Changed |= CFGChanged |=
9294         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9295 
9296   // Build up a worklist of inner-loops to vectorize. This is necessary as
9297   // the act of vectorizing or partially unrolling a loop creates new loops
9298   // and can invalidate iterators across the loops.
9299   SmallVector<Loop *, 8> Worklist;
9300 
9301   for (Loop *L : *LI)
9302     collectSupportedLoops(*L, LI, ORE, Worklist);
9303 
9304   LoopsAnalyzed += Worklist.size();
9305 
9306   // Now walk the identified inner loops.
9307   while (!Worklist.empty()) {
9308     Loop *L = Worklist.pop_back_val();
9309 
9310     // For the inner loops we actually process, form LCSSA to simplify the
9311     // transform.
9312     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
9313 
9314     Changed |= CFGChanged |= processLoop(L);
9315   }
9316 
9317   // Process each loop nest in the function.
9318   return LoopVectorizeResult(Changed, CFGChanged);
9319 }
9320 
9321 PreservedAnalyses LoopVectorizePass::run(Function &F,
9322                                          FunctionAnalysisManager &AM) {
9323   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
9324   auto &LI = AM.getResult<LoopAnalysis>(F);
9325   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
9326   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
9327   auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
9328   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
9329   auto &AA = AM.getResult<AAManager>(F);
9330   auto &AC = AM.getResult<AssumptionAnalysis>(F);
9331   auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
9332   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
9333   MemorySSA *MSSA = EnableMSSALoopDependency
9334                         ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
9335                         : nullptr;
9336
9337   auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
9338   std::function<const LoopAccessInfo &(Loop &)> GetLAA =
9339       [&](Loop &L) -> const LoopAccessInfo & {
9340     LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
9341                                       TLI, TTI, nullptr, MSSA};
9342     return LAM.getResult<LoopAccessAnalysis>(L, AR);
9343   };
9344   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
9345   ProfileSummaryInfo *PSI =
9346       MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
9347   LoopVectorizeResult Result =
9348       runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
9349   if (!Result.MadeAnyChange)
9350     return PreservedAnalyses::all();
9351   PreservedAnalyses PA;
9352
9353   // We currently do not preserve loopinfo/dominator analyses with outer loop
9354   // vectorization. Until this is addressed, mark these analyses as preserved
9355   // only for non-VPlan-native path.
9356   // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
9357   if (!EnableVPlanNativePath) {
9358     PA.preserve<LoopAnalysis>();
9359     PA.preserve<DominatorTreeAnalysis>();
9360   }
9361   PA.preserve<BasicAA>();
9362   PA.preserve<GlobalsAA>();
9363   if (!Result.MadeCFGChange)
9364     PA.preserveSet<CFGAnalyses>();
9365   return PA;
9366 }
9367