1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
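//
// As a purely illustrative sketch (not taken from this pass), a scalar loop
// such as
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
// is conceptually rewritten, for a vectorization factor of 4, into
//   for (i = 0; i + 4 <= n; i += 4)
//     a[i..i+3] = b[i..i+3] + 1;   // one 'wide' iteration, index advances by 4
//   for (; i < n; ++i)
//     a[i] = b[i] + 1;             // scalar remainder (epilogue) loop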
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is an ongoing effort to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 #ifndef NDEBUG
161 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
162 #endif
163 
164 /// @{
165 /// Metadata attribute names
166 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
167 const char LLVMLoopVectorizeFollowupVectorized[] =
168     "llvm.loop.vectorize.followup_vectorized";
169 const char LLVMLoopVectorizeFollowupEpilogue[] =
170     "llvm.loop.vectorize.followup_epilogue";
171 /// @}
172 
173 STATISTIC(LoopsVectorized, "Number of loops vectorized");
174 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
175 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
176 
177 static cl::opt<bool> EnableEpilogueVectorization(
178     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
179     cl::desc("Enable vectorization of epilogue loops."));
180 
181 static cl::opt<unsigned> EpilogueVectorizationForceVF(
182     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
183     cl::desc("When epilogue vectorization is enabled, and a value greater than "
184              "1 is specified, forces the given VF for all applicable epilogue "
185              "loops."));
186 
187 static cl::opt<unsigned> EpilogueVectorizationMinVF(
188     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189     cl::desc("Only loops with vectorization factor equal to or larger than "
190              "the specified value are considered for epilogue vectorization."));
191 
192 /// Loops with a known constant trip count below this number are vectorized only
193 /// if no scalar iteration overheads are incurred.
194 static cl::opt<unsigned> TinyTripCountVectorThreshold(
195     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
196     cl::desc("Loops with a constant trip count that is smaller than this "
197              "value are vectorized only if no scalar iteration overheads "
198              "are incurred."));
199 
200 // Option prefer-predicate-over-epilogue indicates that an epilogue is
201 // undesired and that predication is preferred; the enum below lists the
202 // possible settings. That is, the vectorizer will try to fold the tail loop
203 // (epilogue) into the vector body and predicate the instructions accordingly.
204 // If tail-folding fails, the fallback strategy depends on the chosen value:
205 namespace PreferPredicateTy {
206   enum Option {
207     ScalarEpilogue = 0,
208     PredicateElseScalarEpilogue,
209     PredicateOrDontVectorize
210   };
211 } // namespace PreferPredicateTy
212 
213 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
214     "prefer-predicate-over-epilogue",
215     cl::init(PreferPredicateTy::ScalarEpilogue),
216     cl::Hidden,
217     cl::desc("Tail-folding and predication preferences over creating a scalar "
218              "epilogue loop."),
219     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
220                          "scalar-epilogue",
221                          "Don't tail-predicate loops, create scalar epilogue"),
222               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
223                          "predicate-else-scalar-epilogue",
224                          "prefer tail-folding, create scalar epilogue if tail "
225                          "folding fails."),
226               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
227                          "predicate-dont-vectorize",
228                          "prefers tail-folding, don't attempt vectorization if "
229                          "tail-folding fails.")));
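//
// For example, to ask the vectorizer to prefer tail-folding but still fall
// back to a scalar epilogue, an invocation might look roughly like (sketch
// only; the value names are the ones defined above):
//   opt -loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
//       -S in.ll -o out.ll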
230 
231 static cl::opt<bool> MaximizeBandwidth(
232     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
233     cl::desc("Maximize bandwidth when selecting vectorization factor which "
234              "will be determined by the smallest type in loop."));
235 
236 static cl::opt<bool> EnableInterleavedMemAccesses(
237     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
238     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
239 
240 /// An interleave-group may need masking if it resides in a block that needs
241 /// predication, or in order to mask away gaps.
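/// For example (illustrative only): in
///   for (i = 0; i < n; ++i) { sum0 += A[3*i]; sum1 += A[3*i + 1]; }
/// the two loads form an interleave group with factor 3 and a gap at member
/// index 2; a single wide load covering the group leaves unused gap lanes,
/// which may need to be masked away.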
242 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
243     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
245 
246 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
247     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
248     cl::desc("We don't interleave loops with an estimated constant trip "
249              "count below this number"));
250 
251 static cl::opt<unsigned> ForceTargetNumScalarRegs(
252     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
253     cl::desc("A flag that overrides the target's number of scalar registers."));
254 
255 static cl::opt<unsigned> ForceTargetNumVectorRegs(
256     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
257     cl::desc("A flag that overrides the target's number of vector registers."));
258 
259 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
260     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
261     cl::desc("A flag that overrides the target's max interleave factor for "
262              "scalar loops."));
263 
264 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
265     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
266     cl::desc("A flag that overrides the target's max interleave factor for "
267              "vectorized loops."));
268 
269 static cl::opt<unsigned> ForceTargetInstructionCost(
270     "force-target-instruction-cost", cl::init(0), cl::Hidden,
271     cl::desc("A flag that overrides the target's expected cost for "
272              "an instruction to a single constant value. Mostly "
273              "useful for getting consistent testing."));
274 
275 static cl::opt<unsigned> SmallLoopCost(
276     "small-loop-cost", cl::init(20), cl::Hidden,
277     cl::desc(
278         "The cost of a loop that is considered 'small' by the interleaver."));
279 
280 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
281     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
282     cl::desc("Enable the use of the block frequency analysis to access PGO "
283              "heuristics that minimize code growth in cold regions and are "
284              "more aggressive in hot regions."));
285 
286 // Runtime interleave loops for load/store throughput.
287 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
288     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
289     cl::desc(
290         "Enable runtime interleaving until load/store ports are saturated"));
291 
292 /// Interleave small loops with scalar reductions.
293 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
294     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
295     cl::desc("Enable interleaving for loops with small iteration counts that "
296              "contain scalar reductions to expose ILP."));
297 
298 /// The number of stores in a loop that are allowed to need predication.
299 static cl::opt<unsigned> NumberOfStoresToPredicate(
300     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
301     cl::desc("Max number of stores to be predicated behind an if."));
302 
303 static cl::opt<bool> EnableIndVarRegisterHeur(
304     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
305     cl::desc("Count the induction variable only once when interleaving"));
306 
307 static cl::opt<bool> EnableCondStoresVectorization(
308     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
309     cl::desc("Enable if predication of stores during vectorization."));
310 
311 static cl::opt<unsigned> MaxNestedScalarReductionIC(
312     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
313     cl::desc("The maximum interleave count to use when interleaving a scalar "
314              "reduction in a nested loop."));
315 
316 static cl::opt<bool>
317     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
318                            cl::Hidden,
319                            cl::desc("Prefer in-loop vector reductions, "
320                                     "overriding the target's preference."));
321 
322 static cl::opt<bool> PreferPredicatedReductionSelect(
323     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
324     cl::desc(
325         "Prefer predicating a reduction operation over an after loop select."));
326 
327 cl::opt<bool> EnableVPlanNativePath(
328     "enable-vplan-native-path", cl::init(false), cl::Hidden,
329     cl::desc("Enable VPlan-native vectorization path with "
330              "support for outer loop vectorization."));
331 
332 // FIXME: Remove this switch once we have divergence analysis. Currently we
333 // assume divergent non-backedge branches when this switch is true.
334 cl::opt<bool> EnableVPlanPredication(
335     "enable-vplan-predication", cl::init(false), cl::Hidden,
336     cl::desc("Enable VPlan-native vectorization path predicator with "
337              "support for outer loop vectorization."));
338 
339 // This flag enables the stress testing of the VPlan H-CFG construction in the
340 // VPlan-native vectorization path. It must be used in conjunction with
341 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
342 // verification of the H-CFGs built.
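//
// For example, a stress-test run might look roughly like (sketch only):
//   opt -loop-vectorize -enable-vplan-native-path -vplan-build-stress-test \
//       -vplan-verify-hcfg -S in.ll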
343 static cl::opt<bool> VPlanBuildStressTest(
344     "vplan-build-stress-test", cl::init(false), cl::Hidden,
345     cl::desc(
346         "Build VPlan for every supported loop nest in the function and bail "
347         "out right after the build (stress test the VPlan H-CFG construction "
348         "in the VPlan-native vectorization path)."));
349 
350 cl::opt<bool> llvm::EnableLoopInterleaving(
351     "interleave-loops", cl::init(true), cl::Hidden,
352     cl::desc("Enable loop interleaving in Loop vectorization passes"));
353 cl::opt<bool> llvm::EnableLoopVectorization(
354     "vectorize-loops", cl::init(true), cl::Hidden,
355     cl::desc("Run the Loop vectorization passes"));
356 
357 /// A helper function that returns the type of loaded or stored value.
358 static Type *getMemInstValueType(Value *I) {
359   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
360          "Expected Load or Store instruction");
361   if (auto *LI = dyn_cast<LoadInst>(I))
362     return LI->getType();
363   return cast<StoreInst>(I)->getValueOperand()->getType();
364 }
365 
366 /// A helper function that returns true if the given type is irregular. The
367 /// type is irregular if its allocated size doesn't equal the store size of an
368 /// element of the corresponding vector type at the given vectorization factor.
369 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
370   // Determine if an array of VF elements of type Ty is "bitcast compatible"
371   // with a <VF x Ty> vector.
372   if (VF.isVector()) {
373     auto *VectorTy = VectorType::get(Ty, VF);
374     return TypeSize::get(VF.getKnownMinValue() *
375                              DL.getTypeAllocSize(Ty).getFixedValue(),
376                          VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
377   }
378 
379   // If the vectorization factor is one, we just check if an array of type Ty
380   // requires padding between elements.
381   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
382 }
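
// Illustrative usage of hasIrregularType (hypothetical values, assuming a
// typical data layout):
//   hasIrregularType(Type::getInt32Ty(Ctx), DL, ElementCount::getFixed(4))
//     // -> false: i32's alloc size equals its store size.
//   hasIrregularType(Type::getIntNTy(Ctx, 24), DL, ElementCount::getFixed(4))
//     // -> true: i24 stores 3 bytes but is allocated in 4, so an array of
//     //    four i24s is not bitcast compatible with <4 x i24>.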
383 
384 /// A helper function that returns the reciprocal of the block probability of
385 /// predicated blocks. If we return X, we are assuming the predicated block
386 /// will execute once for every X iterations of the loop header.
387 ///
388 /// TODO: We should use actual block probability here, if available. Currently,
389 ///       we always assume predicated blocks have a 50% chance of executing.
390 static unsigned getReciprocalPredBlockProb() { return 2; }
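
// Illustrative use of the 50% assumption above: if executing a predicated
// block costs C per execution, its expected contribution per loop iteration
// is taken to be roughly C / getReciprocalPredBlockProb(), i.e. C / 2.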
391 
392 /// A helper function that adds a 'fast' flag to floating-point operations.
393 static Value *addFastMathFlag(Value *V) {
394   if (isa<FPMathOperator>(V))
395     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
396   return V;
397 }
398 
399 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
400   if (isa<FPMathOperator>(V))
401     cast<Instruction>(V)->setFastMathFlags(FMF);
402   return V;
403 }
404 
405 /// A helper function that returns an integer or floating-point constant with
406 /// value C.
407 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
408   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
409                            : ConstantFP::get(Ty, C);
410 }
411 
412 /// Returns "best known" trip count for the specified loop \p L as defined by
413 /// the following procedure:
414 ///   1) Returns exact trip count if it is known.
415 ///   2) Returns expected trip count according to profile data if any.
416 ///   3) Returns upper bound estimate if it is known.
417 ///   4) Returns None if all of the above failed.
418 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
419   // Check if exact trip count is known.
420   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
421     return ExpectedTC;
422 
423   // Check if there is an expected trip count available from profile data.
424   if (LoopVectorizeWithBlockFrequency)
425     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
426       return EstimatedTC;
427 
428   // Check if upper bound estimate is known.
429   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
430     return ExpectedTC;
431 
432   return None;
433 }
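
// A minimal usage sketch (hypothetical caller, not code from this file):
//   if (Optional<unsigned> ExpectedTC = getSmallBestKnownTC(SE, L))
//     if (*ExpectedTC < TinyTripCountVectorThreshold)
//       ; // e.g. be more conservative about vectorizing this tiny loop.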
434 
435 namespace llvm {
436 
437 /// InnerLoopVectorizer vectorizes loops which contain only one basic
438 /// block to a specified vectorization factor (VF).
439 /// This class performs the widening of scalars into vectors, or multiple
440 /// scalars. This class also implements the following features:
441 /// * It inserts an epilogue loop for handling loops that don't have iteration
442 ///   counts that are known to be a multiple of the vectorization factor.
443 /// * It handles the code generation for reduction variables.
444 /// * Scalarization (implementation using scalars) of un-vectorizable
445 ///   instructions.
446 /// InnerLoopVectorizer does not perform any vectorization-legality
447 /// checks, and relies on the caller to check for the different legality
448 /// aspects. The InnerLoopVectorizer relies on the
449 /// LoopVectorizationLegality class to provide information about the induction
450 /// and reduction variables that were found for a given vectorization factor.
451 class InnerLoopVectorizer {
452 public:
453   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
454                       LoopInfo *LI, DominatorTree *DT,
455                       const TargetLibraryInfo *TLI,
456                       const TargetTransformInfo *TTI, AssumptionCache *AC,
457                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
458                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
459                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
460                       ProfileSummaryInfo *PSI)
461       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
462         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
463         Builder(PSE.getSE()->getContext()),
464         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
465         BFI(BFI), PSI(PSI) {
466     // Query this against the original loop and save it here because the profile
467     // of the original loop header may change as the transformation happens.
468     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
469         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
470   }
471 
472   virtual ~InnerLoopVectorizer() = default;
473 
474   /// Create a new empty loop that will contain vectorized instructions later
475   /// on, while the old loop will be used as the scalar remainder. Control flow
476   /// is generated around the vectorized (and scalar epilogue) loops consisting
477   /// of various checks and bypasses. Return the pre-header block of the new
478   /// loop.
479   /// In the case of epilogue vectorization, this function is overridden to
480   /// handle the more complex control flow around the loops.
481   virtual BasicBlock *createVectorizedLoopSkeleton();
482 
483   /// Widen a single instruction within the innermost loop.
484   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
485                         VPTransformState &State);
486 
487   /// Widen a single call instruction within the innermost loop.
488   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
489                             VPTransformState &State);
490 
491   /// Widen a single select instruction within the innermost loop.
492   void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
493                               bool InvariantCond, VPTransformState &State);
494 
495   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
496   void fixVectorizedLoop();
497 
498   // Return true if any runtime check is added.
499   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
500 
501   /// A type for vectorized values in the new loop. Each value from the
502   /// original loop, when vectorized, is represented by UF vector values in the
503   /// new unrolled loop, where UF is the unroll factor.
504   using VectorParts = SmallVector<Value *, 2>;
505 
506   /// Vectorize a single GetElementPtrInst based on information gathered and
507   /// decisions taken during planning.
508   void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
509                 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
510                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
511 
512   /// Vectorize a single PHINode in a block. This method handles the induction
513   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
514   /// arbitrary length vectors.
515   void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);
516 
517   /// A helper function to scalarize a single Instruction in the innermost loop.
518   /// Generates a sequence of scalar instances for each lane between \p MinLane
519   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
520   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
521   /// Instr's operands.
522   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
523                             const VPIteration &Instance, bool IfPredicateInstr,
524                             VPTransformState &State);
525 
526   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
527   /// is provided, the integer induction variable will first be truncated to
528   /// the corresponding type.
529   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
530 
531   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
532   /// vector or scalar value on-demand if one is not yet available. When
533   /// vectorizing a loop, we visit the definition of an instruction before its
534   /// uses. When visiting the definition, we either vectorize or scalarize the
535   /// instruction, creating an entry for it in the corresponding map. (In some
536   /// cases, such as induction variables, we will create both vector and scalar
537   /// entries.) Then, as we encounter uses of the definition, we derive values
538   /// for each scalar or vector use unless such a value is already available.
539   /// For example, if we scalarize a definition and one of its uses is vector,
540   /// we build the required vector on-demand with an insertelement sequence
541   /// when visiting the use. Otherwise, if the use is scalar, we can use the
542   /// existing scalar definition.
543   ///
544   /// Return a value in the new loop corresponding to \p V from the original
545   /// loop at unroll index \p Part. If the value has already been vectorized,
546   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
547   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
548   /// a new vector value on-demand by inserting the scalar values into a vector
549   /// with an insertelement sequence. If the value has been neither vectorized
550   /// nor scalarized, it must be loop invariant, so we simply broadcast the
551   /// value into a vector.
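  ///
  /// As an illustrative IR sketch (not emitted verbatim here), packing four
  /// scalar lanes %s0..%s3 of an i32 definition into a vector use looks like:
  ///   %v0 = insertelement <4 x i32> undef, i32 %s0, i32 0
  ///   %v1 = insertelement <4 x i32> %v0, i32 %s1, i32 1
  ///   %v2 = insertelement <4 x i32> %v1, i32 %s2, i32 2
  ///   %v3 = insertelement <4 x i32> %v2, i32 %s3, i32 3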
552   Value *getOrCreateVectorValue(Value *V, unsigned Part);
553 
554   void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
555     VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
556   }
557 
558   /// Return a value in the new loop corresponding to \p V from the original
559   /// loop at unroll and vector indices \p Instance. If the value has been
560   /// vectorized but not scalarized, the necessary extractelement instruction
561   /// will be generated.
562   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
563 
564   /// Construct the vector value of a scalarized value \p V one lane at a time.
565   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
566 
567   /// Try to vectorize interleaved access group \p Group with the base address
568   /// given in \p Addr, optionally masking the vector operations if \p
569   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
570   /// values in the vectorized loop.
571   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
572                                 ArrayRef<VPValue *> VPDefs,
573                                 VPTransformState &State, VPValue *Addr,
574                                 ArrayRef<VPValue *> StoredValues,
575                                 VPValue *BlockInMask = nullptr);
576 
577   /// Vectorize Load and Store instructions with the base address given in \p
578   /// Addr, optionally masking the vector operations if \p BlockInMask is
579   /// non-null. Use \p State to translate given VPValues to IR values in the
580   /// vectorized loop.
581   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
582                                   VPValue *Def, VPValue *Addr,
583                                   VPValue *StoredValue, VPValue *BlockInMask);
584 
585   /// Set the debug location in the builder using the debug location in
586   /// the instruction.
587   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
588 
589   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
590   void fixNonInductionPHIs(void);
591 
592 protected:
593   friend class LoopVectorizationPlanner;
594 
595   /// A small list of PHINodes.
596   using PhiVector = SmallVector<PHINode *, 4>;
597 
598   /// A type for scalarized values in the new loop. Each value from the
599   /// original loop, when scalarized, is represented by UF x VF scalar values
600   /// in the new unrolled loop, where UF is the unroll factor and VF is the
601   /// vectorization factor.
602   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
603 
604   /// Set up the values of the IVs correctly when exiting the vector loop.
605   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
606                     Value *CountRoundDown, Value *EndValue,
607                     BasicBlock *MiddleBlock);
608 
609   /// Create a new induction variable inside L.
610   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
611                                    Value *Step, Instruction *DL);
612 
613   /// Handle all cross-iteration phis in the header.
614   void fixCrossIterationPHIs();
615 
616   /// Fix a first-order recurrence. This is the second phase of vectorizing
617   /// this phi node.
618   void fixFirstOrderRecurrence(PHINode *Phi);
619 
620   /// Fix a reduction cross-iteration phi. This is the second phase of
621   /// vectorizing this phi node.
622   void fixReduction(PHINode *Phi);
623 
624   /// Clear NSW/NUW flags from reduction instructions if necessary.
625   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
626 
627   /// The loop exit block may have single-value PHI nodes with some
628   /// incoming value. While vectorizing, we only handled real values
629   /// that were defined inside the loop, and we should have one value for
630   /// each predecessor of its parent basic block. See PR14725.
631   void fixLCSSAPHIs();
632 
633   /// Iteratively sink the scalarized operands of a predicated instruction into
634   /// the block that was created for it.
635   void sinkScalarOperands(Instruction *PredInst);
636 
637   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
638   /// represented as.
639   void truncateToMinimalBitwidths();
640 
641   /// Create a broadcast instruction. This method generates a broadcast
642   /// instruction (shuffle) for loop invariant values and for the induction
643   /// value. If this is the induction variable then we extend it to N, N+1, ...
644   /// which is needed because each iteration in the loop corresponds to a SIMD
645   /// element.
646   virtual Value *getBroadcastInstrs(Value *V);
647 
648   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
649   /// to each vector element of Val. The sequence starts at StartIdx.
650   /// \p Opcode is relevant for FP induction variable.
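  /// For example (illustrative only), with Val = <0, 0, 0, 0>, StartIdx = 0 and
  /// Step = 1 this produces <0, 1, 2, 3>; with StartIdx = 4 it produces
  /// <4, 5, 6, 7>, e.g. for a subsequent unrolled part of the induction.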
651   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
652                                Instruction::BinaryOps Opcode =
653                                Instruction::BinaryOpsEnd);
654 
655   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
656   /// variable on which to base the steps, \p Step is the size of the step, and
657   /// \p EntryVal is the value from the original loop that maps to the steps.
658   /// Note that \p EntryVal doesn't have to be an induction variable - it
659   /// can also be a truncate instruction.
660   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
661                         const InductionDescriptor &ID);
662 
663   /// Create a vector induction phi node based on an existing scalar one. \p
664   /// EntryVal is the value from the original loop that maps to the vector phi
665   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
666   /// truncate instruction, instead of widening the original IV, we widen a
667   /// version of the IV truncated to \p EntryVal's type.
668   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
669                                        Value *Step, Instruction *EntryVal);
670 
671   /// Returns true if an instruction \p I should be scalarized instead of
672   /// vectorized for the chosen vectorization factor.
673   bool shouldScalarizeInstruction(Instruction *I) const;
674 
675   /// Returns true if we should generate a scalar version of \p IV.
676   bool needsScalarInduction(Instruction *IV) const;
677 
678   /// If there is a cast involved in the induction variable \p ID, which should
679   /// be ignored in the vectorized loop body, this function records the
680   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
681   /// cast. We had already proved that the casted Phi is equal to the uncasted
682   /// Phi in the vectorized loop (under a runtime guard), and therefore
683   /// there is no need to vectorize the cast - the same value can be used in the
684   /// vector loop for both the Phi and the cast.
685   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
686   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
687   ///
688   /// \p EntryVal is the value from the original loop that maps to the vector
689   /// phi node and is used to distinguish what is the IV currently being
690   /// processed - original one (if \p EntryVal is a phi corresponding to the
691   /// original IV) or the "newly-created" one based on the proof mentioned above
692   /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
693   /// latter case \p EntryVal is a TruncInst and we must not record anything for
694   /// that IV, but it's error-prone to expect callers of this routine to care
695   /// about that, hence this explicit parameter.
696   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
697                                              const Instruction *EntryVal,
698                                              Value *VectorLoopValue,
699                                              unsigned Part,
700                                              unsigned Lane = UINT_MAX);
701 
702   /// Generate a shuffle sequence that will reverse the vector Vec.
703   virtual Value *reverseVector(Value *Vec);
704 
705   /// Returns (and creates if needed) the original loop trip count.
706   Value *getOrCreateTripCount(Loop *NewLoop);
707 
708   /// Returns (and creates if needed) the trip count of the widened loop.
709   Value *getOrCreateVectorTripCount(Loop *NewLoop);
710 
711   /// Returns a bitcasted value to the requested vector type.
712   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
713   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
714                                 const DataLayout &DL);
715 
716   /// Emit a bypass check to see if the vector trip count is zero, including if
717   /// it overflows.
718   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
719 
720   /// Emit a bypass check to see if all of the SCEV assumptions we've
721   /// had to make are correct.
722   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
723 
724   /// Emit bypass checks to check any memory assumptions we may have made.
725   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
726 
727   /// Compute the transformed value of Index at offset StartValue using step
728   /// StepValue.
729   /// For integer induction, returns StartValue + Index * StepValue.
730   /// For pointer induction, returns StartValue[Index * StepValue].
731   /// FIXME: The newly created binary instructions should contain nsw/nuw
732   /// flags, which can be found from the original scalar operations.
733   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
734                               const DataLayout &DL,
735                               const InductionDescriptor &ID) const;
736 
737   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
738   /// vector loop preheader, middle block and scalar preheader. Also
739   /// allocate a loop object for the new vector loop and return it.
740   Loop *createVectorLoopSkeleton(StringRef Prefix);
741 
742   /// Create new phi nodes for the induction variables to resume iteration count
743   /// in the scalar epilogue, from where the vectorized loop left off (given by
744   /// \p VectorTripCount).
745   /// In cases where the loop skeleton is more complicated (e.g. epilogue
746   /// vectorization) and the resume values can come from an additional bypass
747   /// block, the \p AdditionalBypass pair provides information about the bypass
748   /// block and the end value on the edge from bypass to this loop.
749   void createInductionResumeValues(
750       Loop *L, Value *VectorTripCount,
751       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
752 
753   /// Complete the loop skeleton by adding debug MDs, creating appropriate
754   /// conditional branches in the middle block, preparing the builder and
755   /// running the verifier. Take in the vector loop \p L as argument, and return
756   /// the preheader of the completed vector loop.
757   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
758 
759   /// Add additional metadata to \p To that was not present on \p Orig.
760   ///
761   /// Currently this is used to add the noalias annotations based on the
762   /// inserted memchecks.  Use this for instructions that are *cloned* into the
763   /// vector loop.
764   void addNewMetadata(Instruction *To, const Instruction *Orig);
765 
766   /// Add metadata from one instruction to another.
767   ///
768   /// This includes both the original MDs from \p From and additional ones (\see
769   /// addNewMetadata).  Use this for *newly created* instructions in the vector
770   /// loop.
771   void addMetadata(Instruction *To, Instruction *From);
772 
773   /// Similar to the previous function but it adds the metadata to a
774   /// vector of instructions.
775   void addMetadata(ArrayRef<Value *> To, Instruction *From);
776 
777   /// Allow subclasses to override and print debug traces before/after vplan
778   /// execution, when trace information is requested.
779   virtual void printDebugTracesAtStart(){};
780   virtual void printDebugTracesAtEnd(){};
781 
782   /// The original loop.
783   Loop *OrigLoop;
784 
785   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
786   /// dynamic knowledge to simplify SCEV expressions and converts them to a
787   /// more usable form.
788   PredicatedScalarEvolution &PSE;
789 
790   /// Loop Info.
791   LoopInfo *LI;
792 
793   /// Dominator Tree.
794   DominatorTree *DT;
795 
796   /// Alias Analysis.
797   AAResults *AA;
798 
799   /// Target Library Info.
800   const TargetLibraryInfo *TLI;
801 
802   /// Target Transform Info.
803   const TargetTransformInfo *TTI;
804 
805   /// Assumption Cache.
806   AssumptionCache *AC;
807 
808   /// Interface to emit optimization remarks.
809   OptimizationRemarkEmitter *ORE;
810 
811   /// LoopVersioning.  It's only set up (non-null) if memchecks were
812   /// used.
813   ///
814   /// This is currently only used to add no-alias metadata based on the
815   /// memchecks.  The actual versioning is performed manually.
816   std::unique_ptr<LoopVersioning> LVer;
817 
818   /// The vectorization SIMD factor to use. Each vector will have this many
819   /// vector elements.
820   ElementCount VF;
821 
822   /// The vectorization unroll factor to use. Each scalar is vectorized to this
823   /// many different vector instructions.
824   unsigned UF;
825 
826   /// The builder that we use
827   IRBuilder<> Builder;
828 
829   // --- Vectorization state ---
830 
831   /// The vector-loop preheader.
832   BasicBlock *LoopVectorPreHeader;
833 
834   /// The scalar-loop preheader.
835   BasicBlock *LoopScalarPreHeader;
836 
837   /// Middle Block between the vector and the scalar.
838   BasicBlock *LoopMiddleBlock;
839 
840   /// The ExitBlock of the scalar loop.
841   BasicBlock *LoopExitBlock;
842 
843   /// The vector loop body.
844   BasicBlock *LoopVectorBody;
845 
846   /// The scalar loop body.
847   BasicBlock *LoopScalarBody;
848 
849   /// A list of all bypass blocks. The first block is the entry of the loop.
850   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
851 
852   /// The new Induction variable which was added to the new block.
853   PHINode *Induction = nullptr;
854 
855   /// The induction variable of the old basic block.
856   PHINode *OldInduction = nullptr;
857 
858   /// Maps values from the original loop to their corresponding values in the
859   /// vectorized loop. A key value can map to either vector values, scalar
860   /// values or both kinds of values, depending on whether the key was
861   /// vectorized and scalarized.
862   VectorizerValueMap VectorLoopValueMap;
863 
864   /// Store instructions that were predicated.
865   SmallVector<Instruction *, 4> PredicatedInstructions;
866 
867   /// Trip count of the original loop.
868   Value *TripCount = nullptr;
869 
870   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
871   Value *VectorTripCount = nullptr;
872 
873   /// The legality analysis.
874   LoopVectorizationLegality *Legal;
875 
876   /// The profitability analysis.
877   LoopVectorizationCostModel *Cost;
878 
879   // Record whether runtime checks are added.
880   bool AddedSafetyChecks = false;
881 
882   // Holds the end values for each induction variable. We save the end values
883   // so we can later fix-up the external users of the induction variables.
884   DenseMap<PHINode *, Value *> IVEndValues;
885 
886   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
887   // fixed up at the end of vector code generation.
888   SmallVector<PHINode *, 8> OrigPHIsToFix;
889 
890   /// BFI and PSI are used to check for profile guided size optimizations.
891   BlockFrequencyInfo *BFI;
892   ProfileSummaryInfo *PSI;
893 
894   // Whether this loop should be optimized for size based on profile-guided
895   // size optimizations.
896   bool OptForSizeBasedOnProfile;
897 };
898 
899 class InnerLoopUnroller : public InnerLoopVectorizer {
900 public:
901   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
902                     LoopInfo *LI, DominatorTree *DT,
903                     const TargetLibraryInfo *TLI,
904                     const TargetTransformInfo *TTI, AssumptionCache *AC,
905                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
906                     LoopVectorizationLegality *LVL,
907                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
908                     ProfileSummaryInfo *PSI)
909       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
910                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
911                             BFI, PSI) {}
912 
913 private:
914   Value *getBroadcastInstrs(Value *V) override;
915   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
916                        Instruction::BinaryOps Opcode =
917                        Instruction::BinaryOpsEnd) override;
918   Value *reverseVector(Value *Vec) override;
919 };
920 
921 /// Encapsulate information regarding vectorization of a loop and its epilogue.
922 /// This information is meant to be updated and used across two stages of
923 /// epilogue vectorization.
924 struct EpilogueLoopVectorizationInfo {
925   ElementCount MainLoopVF = ElementCount::getFixed(0);
926   unsigned MainLoopUF = 0;
927   ElementCount EpilogueVF = ElementCount::getFixed(0);
928   unsigned EpilogueUF = 0;
929   BasicBlock *MainLoopIterationCountCheck = nullptr;
930   BasicBlock *EpilogueIterationCountCheck = nullptr;
931   BasicBlock *SCEVSafetyCheck = nullptr;
932   BasicBlock *MemSafetyCheck = nullptr;
933   Value *TripCount = nullptr;
934   Value *VectorTripCount = nullptr;
935 
936   EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
937                                 unsigned EUF)
938       : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
939         EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
940     assert(EUF == 1 &&
941            "A high UF for the epilogue loop is likely not beneficial.");
942   }
943 };
944 
945 /// An extension of the inner loop vectorizer that creates a skeleton for a
946 /// vectorized loop that has its epilogue (residual) also vectorized.
947 /// The idea is to run the vplan on a given loop twice, first to set up the
948 /// skeleton and vectorize the main loop, and second to complete the skeleton
949 /// from the first step and vectorize the epilogue.  This is achieved by
950 /// deriving two concrete strategy classes from this base class and invoking
951 /// them in succession from the loop vectorizer planner.
952 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
953 public:
954   InnerLoopAndEpilogueVectorizer(
955       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
956       DominatorTree *DT, const TargetLibraryInfo *TLI,
957       const TargetTransformInfo *TTI, AssumptionCache *AC,
958       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
959       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
960       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
961       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
962                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
963         EPI(EPI) {}
964 
965   // Override this function to handle the more complex control flow around the
966   // three loops.
967   BasicBlock *createVectorizedLoopSkeleton() final override {
968     return createEpilogueVectorizedLoopSkeleton();
969   }
970 
971   /// The interface for creating a vectorized skeleton using one of two
972   /// different strategies, each corresponding to one execution of the vplan
973   /// as described above.
974   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
975 
976   /// Holds and updates state information required to vectorize the main loop
977   /// and its epilogue in two separate passes. This setup helps us avoid
978   /// regenerating and recomputing runtime safety checks. It also helps us to
979   /// shorten the iteration-count-check path length for the cases where the
980   /// iteration count of the loop is so small that the main vector loop is
981   /// completely skipped.
982   EpilogueLoopVectorizationInfo &EPI;
983 };
984 
985 /// A specialized derived class of inner loop vectorizer that performs
986 /// vectorization of *main* loops in the process of vectorizing loops and their
987 /// epilogues.
988 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
989 public:
990   EpilogueVectorizerMainLoop(
991       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
992       DominatorTree *DT, const TargetLibraryInfo *TLI,
993       const TargetTransformInfo *TTI, AssumptionCache *AC,
994       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
995       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
996       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
997       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
998                                        EPI, LVL, CM, BFI, PSI) {}
999   /// Implements the interface for creating a vectorized skeleton using the
1000   /// *main loop* strategy (i.e. the first pass of vplan execution).
1001   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1002 
1003 protected:
1004   /// Emits an iteration count bypass check once for the main loop (when \p
1005   /// ForEpilogue is false) and once for the epilogue loop (when \p
1006   /// ForEpilogue is true).
1007   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
1008                                              bool ForEpilogue);
1009   void printDebugTracesAtStart() override;
1010   void printDebugTracesAtEnd() override;
1011 };
1012 
1013 /// A specialized derived class of inner loop vectorizer that performs
1014 /// vectorization of *epilogue* loops in the process of vectorizing loops and
1015 /// their epilogues.
1016 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
1017 public:
1018   EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
1019                     LoopInfo *LI, DominatorTree *DT,
1020                     const TargetLibraryInfo *TLI,
1021                     const TargetTransformInfo *TTI, AssumptionCache *AC,
1022                     OptimizationRemarkEmitter *ORE,
1023                     EpilogueLoopVectorizationInfo &EPI,
1024                     LoopVectorizationLegality *LVL,
1025                     llvm::LoopVectorizationCostModel *CM,
1026                     BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
1027       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1028                                        EPI, LVL, CM, BFI, PSI) {}
1029   /// Implements the interface for creating a vectorized skeleton using the
1030   /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
1031   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1032 
1033 protected:
1034   /// Emits an iteration count bypass check after the main vector loop has
1035   /// finished to see if there are any iterations left to execute by either
1036   /// the vector epilogue or the scalar epilogue.
1037   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1038                                                       BasicBlock *Bypass,
1039                                                       BasicBlock *Insert);
1040   void printDebugTracesAtStart() override;
1041   void printDebugTracesAtEnd() override;
1042 };
1043 } // end namespace llvm
1044 
1045 /// Look for a meaningful debug location on the instruction or its
1046 /// operands.
1047 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1048   if (!I)
1049     return I;
1050 
1051   DebugLoc Empty;
1052   if (I->getDebugLoc() != Empty)
1053     return I;
1054 
1055   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
1056     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
1057       if (OpInst->getDebugLoc() != Empty)
1058         return OpInst;
1059   }
1060 
1061   return I;
1062 }
1063 
1064 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
1065   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1066     const DILocation *DIL = Inst->getDebugLoc();
1067     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1068         !isa<DbgInfoIntrinsic>(Inst)) {
1069       assert(!VF.isScalable() && "scalable vectors not yet supported.");
1070       auto NewDIL =
1071           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1072       if (NewDIL)
1073         B.SetCurrentDebugLocation(NewDIL.getValue());
1074       else
1075         LLVM_DEBUG(dbgs()
1076                    << "Failed to create new discriminator: "
1077                    << DIL->getFilename() << " Line: " << DIL->getLine());
1078     }
1079     else
1080       B.SetCurrentDebugLocation(DIL);
1081   } else
1082     B.SetCurrentDebugLocation(DebugLoc());
1083 }
1084 
1085 /// Write a record \p DebugMsg about vectorization failure to the debug
1086 /// output stream. If \p I is passed, it is an instruction that prevents
1087 /// vectorization.
1088 #ifndef NDEBUG
1089 static void debugVectorizationFailure(const StringRef DebugMsg,
1090     Instruction *I) {
1091   dbgs() << "LV: Not vectorizing: " << DebugMsg;
1092   if (I != nullptr)
1093     dbgs() << " " << *I;
1094   else
1095     dbgs() << '.';
1096   dbgs() << '\n';
1097 }
1098 #endif
1099 
1100 /// Create an analysis remark that explains why vectorization failed
1101 ///
1102 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1103 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1104 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1105 /// the location of the remark.  \return the remark object that can be
1106 /// streamed to.
1107 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1108     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1109   Value *CodeRegion = TheLoop->getHeader();
1110   DebugLoc DL = TheLoop->getStartLoc();
1111 
1112   if (I) {
1113     CodeRegion = I->getParent();
1114     // If there is no debug location attached to the instruction, fall back
1115     // to using the loop's.
1116     if (I->getDebugLoc())
1117       DL = I->getDebugLoc();
1118   }
1119 
1120   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
1121   R << "loop not vectorized: ";
1122   return R;
1123 }
1124 
1125 /// Return a value for Step multiplied by VF.
1126 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1127   assert(isa<ConstantInt>(Step) && "Expected an integer step");
1128   Constant *StepVal = ConstantInt::get(
1129       Step->getType(),
1130       cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1131   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1132 }
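
// For example (derived from the code above): with Step = i64 2 and a fixed
// VF of 4 this returns the constant i64 8; with a scalable VF of 4 it instead
// emits "vscale * 8" via IRBuilder::CreateVScale, since the runtime vector
// length is not known at compile time.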
1133 
1134 namespace llvm {
1135 
1136 void reportVectorizationFailure(const StringRef DebugMsg,
1137     const StringRef OREMsg, const StringRef ORETag,
1138     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
1139   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
1140   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1141   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
1142                 ORETag, TheLoop, I) << OREMsg);
1143 }
1144 
1145 } // end namespace llvm
1146 
1147 #ifndef NDEBUG
1148 /// \return string containing a file name and a line # for the given loop.
1149 static std::string getDebugLocString(const Loop *L) {
1150   std::string Result;
1151   if (L) {
1152     raw_string_ostream OS(Result);
1153     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1154       LoopDbgLoc.print(OS);
1155     else
1156       // Just print the module name.
1157       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1158     OS.flush();
1159   }
1160   return Result;
1161 }
1162 #endif
1163 
1164 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1165                                          const Instruction *Orig) {
1166   // If the loop was versioned with memchecks, add the corresponding no-alias
1167   // metadata.
1168   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1169     LVer->annotateInstWithNoAlias(To, Orig);
1170 }
1171 
1172 void InnerLoopVectorizer::addMetadata(Instruction *To,
1173                                       Instruction *From) {
1174   propagateMetadata(To, From);
1175   addNewMetadata(To, From);
1176 }
1177 
1178 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1179                                       Instruction *From) {
1180   for (Value *V : To) {
1181     if (Instruction *I = dyn_cast<Instruction>(V))
1182       addMetadata(I, From);
1183   }
1184 }
1185 
1186 namespace llvm {
1187 
// Hints for the loop vectorization cost model about how the scalar epilogue
// loop should be lowered.
1190 enum ScalarEpilogueLowering {
1191 
1192   // The default: allowing scalar epilogues.
1193   CM_ScalarEpilogueAllowed,
1194 
1195   // Vectorization with OptForSize: don't allow epilogues.
1196   CM_ScalarEpilogueNotAllowedOptSize,
1197 
  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant and free of runtime
  // guards and scalar iteration overheads.
1202   CM_ScalarEpilogueNotAllowedLowTripLoop,
1203 
1204   // Loop hint predicate indicating an epilogue is undesired.
1205   CM_ScalarEpilogueNotNeededUsePredicate,
1206 
  // Directive indicating we must either tail fold or not vectorize.
1208   CM_ScalarEpilogueNotAllowedUsePredicate
1209 };
1210 
1211 /// LoopVectorizationCostModel - estimates the expected speedups due to
1212 /// vectorization.
1213 /// In many cases vectorization is not profitable. This can happen because of
1214 /// a number of reasons. In this class we mainly attempt to predict the
1215 /// expected speedup/slowdowns due to the supported instruction set. We use the
1216 /// TargetTransformInfo to query the different backends for the cost of
1217 /// different operations.
1218 class LoopVectorizationCostModel {
1219 public:
1220   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1221                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1222                              LoopVectorizationLegality *Legal,
1223                              const TargetTransformInfo &TTI,
1224                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1225                              AssumptionCache *AC,
1226                              OptimizationRemarkEmitter *ORE, const Function *F,
1227                              const LoopVectorizeHints *Hints,
1228                              InterleavedAccessInfo &IAI)
1229       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1230         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1231         Hints(Hints), InterleaveInfo(IAI) {}
1232 
1233   /// \return An upper bound for the vectorization factor, or None if
1234   /// vectorization and interleaving should be avoided up front.
1235   Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1236 
1237   /// \return True if runtime checks are required for vectorization, and false
1238   /// otherwise.
1239   bool runtimeChecksRequired();
1240 
1241   /// \return The most profitable vectorization factor and the cost of that VF.
1242   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1243   /// then this vectorization factor will be selected if vectorization is
1244   /// possible.
1245   VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1246   VectorizationFactor
1247   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1248                                     const LoopVectorizationPlanner &LVP);
1249 
1250   /// Setup cost-based decisions for user vectorization factor.
1251   void selectUserVectorizationFactor(ElementCount UserVF) {
1252     collectUniformsAndScalars(UserVF);
1253     collectInstsToScalarize(UserVF);
1254   }
1255 
1256   /// \return The size (in bits) of the smallest and widest types in the code
1257   /// that needs to be vectorized. We ignore values that remain scalar such as
1258   /// 64 bit loop indices.
1259   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1260 
1261   /// \return The desired interleave count.
1262   /// If interleave count has been specified by metadata it will be returned.
1263   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1264   /// are the selected vectorization factor and the cost of the selected VF.
1265   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1266 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
1272   /// The calculated cost is saved with widening decision in order to
1273   /// avoid redundant calculations.
1274   void setCostBasedWideningDecision(ElementCount VF);
1275 
1276   /// A struct that represents some properties of the register usage
1277   /// of a loop.
1278   struct RegisterUsage {
1279     /// Holds the number of loop invariant values that are used in the loop.
1280     /// The key is ClassID of target-provided register class.
1281     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1282     /// Holds the maximum number of concurrent live intervals in the loop.
1283     /// The key is ClassID of target-provided register class.
1284     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1285   };
1286 
  /// \return Information about the register usage of the loop for the given
  /// vectorization factors.
1289   SmallVector<RegisterUsage, 8>
1290   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1291 
1292   /// Collect values we want to ignore in the cost model.
1293   void collectValuesToIgnore();
1294 
  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
1297   void collectInLoopReductions();
1298 
1299   /// \returns The smallest bitwidth each instruction can be represented with.
1300   /// The vector equivalents of these instructions should be truncated to this
1301   /// type.
1302   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1303     return MinBWs;
1304   }
1305 
1306   /// \returns True if it is more profitable to scalarize instruction \p I for
1307   /// vectorization factor \p VF.
1308   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1309     assert(VF.isVector() &&
1310            "Profitable to scalarize relevant only for VF > 1.");
1311 
1312     // Cost model is not run in the VPlan-native path - return conservative
1313     // result until this changes.
1314     if (EnableVPlanNativePath)
1315       return false;
1316 
1317     auto Scalars = InstsToScalarize.find(VF);
1318     assert(Scalars != InstsToScalarize.end() &&
1319            "VF not yet analyzed for scalarization profitability");
1320     return Scalars->second.find(I) != Scalars->second.end();
1321   }
1322 
1323   /// Returns true if \p I is known to be uniform after vectorization.
1324   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1325     if (VF.isScalar())
1326       return true;
1327 
1328     // Cost model is not run in the VPlan-native path - return conservative
1329     // result until this changes.
1330     if (EnableVPlanNativePath)
1331       return false;
1332 
1333     auto UniformsPerVF = Uniforms.find(VF);
1334     assert(UniformsPerVF != Uniforms.end() &&
1335            "VF not yet analyzed for uniformity");
1336     return UniformsPerVF->second.count(I);
1337   }
1338 
1339   /// Returns true if \p I is known to be scalar after vectorization.
1340   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1341     if (VF.isScalar())
1342       return true;
1343 
1344     // Cost model is not run in the VPlan-native path - return conservative
1345     // result until this changes.
1346     if (EnableVPlanNativePath)
1347       return false;
1348 
1349     auto ScalarsPerVF = Scalars.find(VF);
1350     assert(ScalarsPerVF != Scalars.end() &&
1351            "Scalar values are not calculated for VF");
1352     return ScalarsPerVF->second.count(I);
1353   }
1354 
1355   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1356   /// for vectorization factor \p VF.
1357   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1358     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1359            !isProfitableToScalarize(I, VF) &&
1360            !isScalarAfterVectorization(I, VF);
1361   }
1362 
1363   /// Decision that was taken during cost calculation for memory instruction.
1364   enum InstWidening {
1365     CM_Unknown,
1366     CM_Widen,         // For consecutive accesses with stride +1.
1367     CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,    // For accesses handled via an interleave group.
    CM_GatherScatter, // For accesses widened as gather/scatter operations.
    CM_Scalarize      // For accesses that are scalarized.
1371   };
1372 
1373   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1374   /// instruction \p I and vector width \p VF.
1375   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1376                            unsigned Cost) {
1377     assert(VF.isVector() && "Expected VF >=2");
1378     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1379   }
1380 
1381   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1382   /// interleaving group \p Grp and vector width \p VF.
1383   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1384                            ElementCount VF, InstWidening W, unsigned Cost) {
1385     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1388     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1389       if (auto *I = Grp->getMember(i)) {
1390         if (Grp->getInsertPos() == I)
1391           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1392         else
1393           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1394       }
1395     }
1396   }
1397 
1398   /// Return the cost model decision for the given instruction \p I and vector
1399   /// width \p VF. Return CM_Unknown if this instruction did not pass
1400   /// through the cost modeling.
1401   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1402     assert(VF.isVector() && "Expected VF to be a vector VF");
1403     // Cost model is not run in the VPlan-native path - return conservative
1404     // result until this changes.
1405     if (EnableVPlanNativePath)
1406       return CM_GatherScatter;
1407 
1408     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1409     auto Itr = WideningDecisions.find(InstOnVF);
1410     if (Itr == WideningDecisions.end())
1411       return CM_Unknown;
1412     return Itr->second.first;
1413   }
1414 
1415   /// Return the vectorization cost for the given instruction \p I and vector
1416   /// width \p VF.
1417   unsigned getWideningCost(Instruction *I, ElementCount VF) {
1418     assert(VF.isVector() && "Expected VF >=2");
1419     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1420     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1421            "The cost is not calculated");
1422     return WideningDecisions[InstOnVF].second;
1423   }
1424 
1425   /// Return True if instruction \p I is an optimizable truncate whose operand
1426   /// is an induction variable. Such a truncate will be removed by adding a new
1427   /// induction variable with the destination type.
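  /// For example, a truncate such as 'trunc i64 %iv to i32' of a widened
  /// induction variable can be replaced by a new i32 induction, provided the
  /// truncate is not free for the target or the operand is the primary
  /// induction variable.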
1428   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1429     // If the instruction is not a truncate, return false.
1430     auto *Trunc = dyn_cast<TruncInst>(I);
1431     if (!Trunc)
1432       return false;
1433 
1434     // Get the source and destination types of the truncate.
1435     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1436     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1437 
1438     // If the truncate is free for the given types, return false. Replacing a
1439     // free truncate with an induction variable would add an induction variable
1440     // update instruction to each iteration of the loop. We exclude from this
1441     // check the primary induction variable since it will need an update
1442     // instruction regardless.
1443     Value *Op = Trunc->getOperand(0);
1444     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1445       return false;
1446 
1447     // If the truncated value is not an induction variable, return false.
1448     return Legal->isInductionPhi(Op);
1449   }
1450 
1451   /// Collects the instructions to scalarize for each predicated instruction in
1452   /// the loop.
1453   void collectInstsToScalarize(ElementCount VF);
1454 
1455   /// Collect Uniform and Scalar values for the given \p VF.
1456   /// The sets depend on CM decision for Load/Store instructions
1457   /// that may be vectorized as interleave, gather-scatter or scalarized.
1458   void collectUniformsAndScalars(ElementCount VF) {
1459     // Do the analysis once.
1460     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1461       return;
1462     setCostBasedWideningDecision(VF);
1463     collectLoopUniforms(VF);
1464     collectLoopScalars(VF);
1465   }
1466 
1467   /// Returns true if the target machine supports masked store operation
1468   /// for the given \p DataType and kind of access to \p Ptr.
1469   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1470     return Legal->isConsecutivePtr(Ptr) &&
1471            TTI.isLegalMaskedStore(DataType, Alignment);
1472   }
1473 
1474   /// Returns true if the target machine supports masked load operation
1475   /// for the given \p DataType and kind of access to \p Ptr.
1476   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1477     return Legal->isConsecutivePtr(Ptr) &&
1478            TTI.isLegalMaskedLoad(DataType, Alignment);
1479   }
1480 
1481   /// Returns true if the target machine supports masked scatter operation
1482   /// for the given \p DataType.
1483   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1484     return TTI.isLegalMaskedScatter(DataType, Alignment);
1485   }
1486 
1487   /// Returns true if the target machine supports masked gather operation
1488   /// for the given \p DataType.
1489   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1490     return TTI.isLegalMaskedGather(DataType, Alignment);
1491   }
1492 
1493   /// Returns true if the target machine can represent \p V as a masked gather
1494   /// or scatter operation.
1495   bool isLegalGatherOrScatter(Value *V) {
1496     bool LI = isa<LoadInst>(V);
1497     bool SI = isa<StoreInst>(V);
1498     if (!LI && !SI)
1499       return false;
1500     auto *Ty = getMemInstValueType(V);
1501     Align Align = getLoadStoreAlignment(V);
1502     return (LI && isLegalMaskedGather(Ty, Align)) ||
1503            (SI && isLegalMaskedScatter(Ty, Align));
1504   }
1505 
1506   /// Returns true if \p I is an instruction that will be scalarized with
1507   /// predication. Such instructions include conditional stores and
1508   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1511   bool isScalarWithPredication(Instruction *I,
1512                                ElementCount VF = ElementCount::getFixed(1));
1513 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked
  /// gather/scatter. Superset of instructions that return true for
  /// isScalarWithPredication.
1517   bool isPredicatedInst(Instruction *I) {
1518     if (!blockNeedsPredication(I->getParent()))
1519       return false;
1520     // Loads and stores that need some form of masked operation are predicated
1521     // instructions.
1522     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1523       return Legal->isMaskRequired(I);
1524     return isScalarWithPredication(I);
1525   }
1526 
1527   /// Returns true if \p I is a memory instruction with consecutive memory
1528   /// access that can be widened.
1529   bool
1530   memoryInstructionCanBeWidened(Instruction *I,
1531                                 ElementCount VF = ElementCount::getFixed(1));
1532 
1533   /// Returns true if \p I is a memory instruction in an interleaved-group
1534   /// of memory accesses that can be vectorized with wide vector loads/stores
1535   /// and shuffles.
1536   bool
1537   interleavedAccessCanBeWidened(Instruction *I,
1538                                 ElementCount VF = ElementCount::getFixed(1));
1539 
1540   /// Check if \p Instr belongs to any interleaved access group.
1541   bool isAccessInterleaved(Instruction *Instr) {
1542     return InterleaveInfo.isInterleaved(Instr);
1543   }
1544 
1545   /// Get the interleaved access group that \p Instr belongs to.
1546   const InterleaveGroup<Instruction> *
1547   getInterleavedAccessGroup(Instruction *Instr) {
1548     return InterleaveInfo.getInterleaveGroup(Instr);
1549   }
1550 
1551   /// Returns true if an interleaved group requires a scalar iteration
1552   /// to handle accesses with gaps, and there is nothing preventing us from
1553   /// creating a scalar epilogue.
1554   bool requiresScalarEpilogue() const {
1555     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1556   }
1557 
  /// Returns true if a scalar epilogue is allowed, i.e. not prohibited by
  /// optsize or a loop hint annotation.
1560   bool isScalarEpilogueAllowed() const {
1561     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1562   }
1563 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1565   bool foldTailByMasking() const { return FoldTailByMasking; }
1566 
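  /// Returns true if the instructions in \p BB must be predicated, i.e. when
  /// the loop tail is folded by masking or when \p BB itself requires
  /// predication according to legality analysis.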
1567   bool blockNeedsPredication(BasicBlock *BB) {
1568     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1569   }
1570 
1571   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1572   /// nodes to the chain of instructions representing the reductions. Uses a
1573   /// MapVector to ensure deterministic iteration order.
1574   using ReductionChainMap =
1575       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1576 
  /// Return the chain of instructions representing an in-loop reduction.
1578   const ReductionChainMap &getInLoopReductionChains() const {
1579     return InLoopReductionChains;
1580   }
1581 
  /// Returns true if the Phi is part of an in-loop reduction.
1583   bool isInLoopReduction(PHINode *Phi) const {
1584     return InLoopReductionChains.count(Phi);
1585   }
1586 
1587   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1588   /// with factor VF.  Return the cost of the instruction, including
1589   /// scalarization overhead if it's needed.
1590   unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1591 
1592   /// Estimate cost of a call instruction CI if it were vectorized with factor
1593   /// VF. Return the cost of the instruction, including scalarization overhead
1594   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e. either the vector version isn't available, or it is
  /// too expensive.
1597   unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1598                              bool &NeedToScalarize);
1599 
1600   /// Invalidates decisions already taken by the cost model.
1601   void invalidateCostModelingDecisions() {
1602     WideningDecisions.clear();
1603     Uniforms.clear();
1604     Scalars.clear();
1605   }
1606 
1607 private:
1608   unsigned NumPredStores = 0;
1609 
1610   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1611   /// than zero. One is returned if vectorization should best be avoided due
1612   /// to cost.
1613   ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1614                                     ElementCount UserVF);
1615 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1623   using VectorizationCostTy = std::pair<unsigned, bool>;
1624 
1625   /// Returns the expected execution cost. The unit of the cost does
1626   /// not matter because we use the 'cost' units to compare different
1627   /// vector widths. The cost that is returned is *not* normalized by
1628   /// the factor width.
1629   VectorizationCostTy expectedCost(ElementCount VF);
1630 
1631   /// Returns the execution time cost of an instruction for a given vector
1632   /// width. Vector width of one means scalar.
1633   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1634 
1635   /// The cost-computation logic from getInstructionCost which provides
1636   /// the vector type as an output parameter.
1637   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1638 
1639   /// Calculate vectorization cost of memory instruction \p I.
1640   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1641 
1642   /// The cost computation for scalarized memory instruction.
1643   unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1644 
1645   /// The cost computation for interleaving group of memory instructions.
1646   unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1647 
1648   /// The cost computation for Gather/Scatter instruction.
1649   unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1650 
1651   /// The cost computation for widening instruction \p I with consecutive
1652   /// memory access.
1653   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1654 
  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element).
1659   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1660 
1661   /// Estimate the overhead of scalarizing an instruction. This is a
1662   /// convenience wrapper for the type-based getScalarizationOverhead API.
1663   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1664 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1667   bool isConsecutiveLoadOrStore(Instruction *I);
1668 
1669   /// Returns true if an artificially high cost for emulated masked memrefs
1670   /// should be used.
1671   bool useEmulatedMaskMemRefHack(Instruction *I);
1672 
1673   /// Map of scalar integer values to the smallest bitwidth they can be legally
1674   /// represented as. The vector equivalents of these values should be truncated
1675   /// to this type.
1676   MapVector<Instruction *, uint64_t> MinBWs;
1677 
1678   /// A type representing the costs for instructions if they were to be
1679   /// scalarized rather than vectorized. The entries are Instruction-Cost
1680   /// pairs.
1681   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1682 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1685   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1686 
1687   /// Records whether it is allowed to have the original scalar loop execute at
1688   /// least once. This may be needed as a fallback loop in case runtime
1689   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or not a multiple of the VF,
1691   /// or as a peel-loop to handle gaps in interleave-groups.
1692   /// Under optsize and when the trip count is very small we don't allow any
1693   /// iterations to execute in the scalar loop.
1694   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1695 
1696   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1697   bool FoldTailByMasking = false;
1698 
1699   /// A map holding scalar costs for different vectorization factors. The
1700   /// presence of a cost for an instruction in the mapping indicates that the
1701   /// instruction will be scalarized when vectorizing with the associated
1702   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1703   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1704 
1705   /// Holds the instructions known to be uniform after vectorization.
1706   /// The data is collected per VF.
1707   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1708 
1709   /// Holds the instructions known to be scalar after vectorization.
1710   /// The data is collected per VF.
1711   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1712 
1713   /// Holds the instructions (address computations) that are forced to be
1714   /// scalarized.
1715   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1716 
1717   /// PHINodes of the reductions that should be expanded in-loop along with
1718   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1720   ReductionChainMap InLoopReductionChains;
1721 
1722   /// Returns the expected difference in cost from scalarizing the expression
1723   /// feeding a predicated instruction \p PredInst. The instructions to
1724   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1725   /// non-negative return value implies the expression will be scalarized.
1726   /// Currently, only single-use chains are considered for scalarization.
1727   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1728                               ElementCount VF);
1729 
1730   /// Collect the instructions that are uniform after vectorization. An
1731   /// instruction is uniform if we represent it with a single scalar value in
1732   /// the vectorized loop corresponding to each vector iteration. Examples of
1733   /// uniform instructions include pointer operands of consecutive or
1734   /// interleaved memory accesses. Note that although uniformity implies an
1735   /// instruction will be scalar, the reverse is not true. In general, a
1736   /// scalarized instruction will be represented by VF scalar values in the
1737   /// vectorized loop, each corresponding to an iteration of the original
1738   /// scalar loop.
1739   void collectLoopUniforms(ElementCount VF);
1740 
1741   /// Collect the instructions that are scalar after vectorization. An
1742   /// instruction is scalar if it is known to be uniform or will be scalarized
1743   /// during vectorization. Non-uniform scalarized instructions will be
1744   /// represented by VF values in the vectorized loop, each corresponding to an
1745   /// iteration of the original scalar loop.
1746   void collectLoopScalars(ElementCount VF);
1747 
1748   /// Keeps cost model vectorization decision and cost for instructions.
1749   /// Right now it is used for memory instructions only.
1750   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1751                                 std::pair<InstWidening, unsigned>>;
1752 
1753   DecisionList WideningDecisions;
1754 
1755   /// Returns true if \p V is expected to be vectorized and it needs to be
1756   /// extracted.
1757   bool needsExtract(Value *V, ElementCount VF) const {
1758     Instruction *I = dyn_cast<Instruction>(V);
1759     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1760         TheLoop->isLoopInvariant(I))
1761       return false;
1762 
1763     // Assume we can vectorize V (and hence we need extraction) if the
1764     // scalars are not computed yet. This can happen, because it is called
1765     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1766     // the scalars are collected. That should be a safe assumption in most
1767     // cases, because we check if the operands have vectorizable types
1768     // beforehand in LoopVectorizationLegality.
1769     return Scalars.find(VF) == Scalars.end() ||
1770            !isScalarAfterVectorization(I, VF);
1771   };
1772 
1773   /// Returns a range containing only operands needing to be extracted.
1774   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1775                                                    ElementCount VF) {
1776     return SmallVector<Value *, 4>(make_filter_range(
1777         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1778   }
1779 
1780   /// Determines if we have the infrastructure to vectorize loop \p L and its
1781   /// epilogue, assuming the main loop is vectorized by \p VF.
1782   bool isCandidateForEpilogueVectorization(const Loop &L,
1783                                            const ElementCount VF) const;
1784 
1785   /// Returns true if epilogue vectorization is considered profitable, and
1786   /// false otherwise.
1787   /// \p VF is the vectorization factor chosen for the original loop.
1788   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1789 
1790 public:
1791   /// The loop that we evaluate.
1792   Loop *TheLoop;
1793 
1794   /// Predicated scalar evolution analysis.
1795   PredicatedScalarEvolution &PSE;
1796 
1797   /// Loop Info analysis.
1798   LoopInfo *LI;
1799 
1800   /// Vectorization legality.
1801   LoopVectorizationLegality *Legal;
1802 
1803   /// Vector target information.
1804   const TargetTransformInfo &TTI;
1805 
1806   /// Target Library Info.
1807   const TargetLibraryInfo *TLI;
1808 
1809   /// Demanded bits analysis.
1810   DemandedBits *DB;
1811 
1812   /// Assumption cache.
1813   AssumptionCache *AC;
1814 
1815   /// Interface to emit optimization remarks.
1816   OptimizationRemarkEmitter *ORE;
1817 
1818   const Function *TheFunction;
1819 
1820   /// Loop Vectorize Hint.
1821   const LoopVectorizeHints *Hints;
1822 
  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride that are close to each other.
1825   InterleavedAccessInfo &InterleaveInfo;
1826 
1827   /// Values to ignore in the cost model.
1828   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1829 
1830   /// Values to ignore in the cost model when VF > 1.
1831   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1832 
1833   /// Profitable vector factors.
1834   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1835 };
1836 
1837 } // end namespace llvm
1838 
1839 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1840 // vectorization. The loop needs to be annotated with #pragma omp simd
1841 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1842 // vector length information is not provided, vectorization is not considered
1843 // explicit. Interleave hints are not allowed either. These limitations will be
1844 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1846 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1847 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1848 // provides *explicit vectorization hints* (LV can bypass legal checks and
1849 // assume that vectorization is legal). However, both hints are implemented
1850 // using the same metadata (llvm.loop.vectorize, processed by
1851 // LoopVectorizeHints). This will be fixed in the future when the native IR
1852 // representation for pragma 'omp simd' is introduced.
1853 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1854                                    OptimizationRemarkEmitter *ORE) {
1855   assert(!OuterLp->isInnermost() && "This is not an outer loop");
1856   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1857 
1858   // Only outer loops with an explicit vectorization hint are supported.
1859   // Unannotated outer loops are ignored.
1860   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1861     return false;
1862 
1863   Function *Fn = OuterLp->getHeader()->getParent();
1864   if (!Hints.allowVectorization(Fn, OuterLp,
1865                                 true /*VectorizeOnlyWhenForced*/)) {
1866     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1867     return false;
1868   }
1869 
1870   if (Hints.getInterleave() > 1) {
1871     // TODO: Interleave support is future work.
1872     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1873                          "outer loops.\n");
1874     Hints.emitRemarkWithHints();
1875     return false;
1876   }
1877 
1878   return true;
1879 }
1880 
1881 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1882                                   OptimizationRemarkEmitter *ORE,
1883                                   SmallVectorImpl<Loop *> &V) {
1884   // Collect inner loops and outer loops without irreducible control flow. For
1885   // now, only collect outer loops that have explicit vectorization hints. If we
1886   // are stress testing the VPlan H-CFG construction, we collect the outermost
1887   // loop of every loop nest.
1888   if (L.isInnermost() || VPlanBuildStressTest ||
1889       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1890     LoopBlocksRPO RPOT(&L);
1891     RPOT.perform(LI);
1892     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1893       V.push_back(&L);
1894       // TODO: Collect inner loops inside marked outer loops in case
1895       // vectorization fails for the outer loop. Do not invoke
1896       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1897       // already known to be reducible. We can use an inherited attribute for
1898       // that.
1899       return;
1900     }
1901   }
1902   for (Loop *InnerL : L)
1903     collectSupportedLoops(*InnerL, LI, ORE, V);
1904 }
1905 
1906 namespace {
1907 
1908 /// The LoopVectorize Pass.
1909 struct LoopVectorize : public FunctionPass {
1910   /// Pass identification, replacement for typeid
1911   static char ID;
1912 
1913   LoopVectorizePass Impl;
1914 
1915   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1916                          bool VectorizeOnlyWhenForced = false)
1917       : FunctionPass(ID),
1918         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1919     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1920   }
1921 
1922   bool runOnFunction(Function &F) override {
1923     if (skipFunction(F))
1924       return false;
1925 
1926     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1927     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1928     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1929     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1930     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1931     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1932     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1933     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1934     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1935     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1936     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1937     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1938     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1939 
1940     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1941         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1942 
1943     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1944                         GetLAA, *ORE, PSI).MadeAnyChange;
1945   }
1946 
1947   void getAnalysisUsage(AnalysisUsage &AU) const override {
1948     AU.addRequired<AssumptionCacheTracker>();
1949     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1950     AU.addRequired<DominatorTreeWrapperPass>();
1951     AU.addRequired<LoopInfoWrapperPass>();
1952     AU.addRequired<ScalarEvolutionWrapperPass>();
1953     AU.addRequired<TargetTransformInfoWrapperPass>();
1954     AU.addRequired<AAResultsWrapperPass>();
1955     AU.addRequired<LoopAccessLegacyAnalysis>();
1956     AU.addRequired<DemandedBitsWrapperPass>();
1957     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1958     AU.addRequired<InjectTLIMappingsLegacy>();
1959 
1960     // We currently do not preserve loopinfo/dominator analyses with outer loop
1961     // vectorization. Until this is addressed, mark these analyses as preserved
1962     // only for non-VPlan-native path.
1963     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1964     if (!EnableVPlanNativePath) {
1965       AU.addPreserved<LoopInfoWrapperPass>();
1966       AU.addPreserved<DominatorTreeWrapperPass>();
1967     }
1968 
1969     AU.addPreserved<BasicAAWrapperPass>();
1970     AU.addPreserved<GlobalsAAWrapperPass>();
1971     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1972   }
1973 };
1974 
1975 } // end anonymous namespace
1976 
1977 //===----------------------------------------------------------------------===//
1978 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1979 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1980 //===----------------------------------------------------------------------===//
1981 
1982 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // inside the vector loop body.
1986   Instruction *Instr = dyn_cast<Instruction>(V);
1987   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1988                      (!Instr ||
1989                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1990   // Place the code for broadcasting invariant variables in the new preheader.
1991   IRBuilder<>::InsertPointGuard Guard(Builder);
1992   if (SafeToHoist)
1993     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1994 
1995   // Broadcast the scalar into all locations in the vector.
1996   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1997 
1998   return Shuf;
1999 }
2000 
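// Create the vector induction phi and its update chain for the induction
// described by \p II. The phi starts at splat(Start) + <0, 1, ..., VF-1> *
// Step in the vector preheader, unroll part k maps to the phi advanced by
// k * VF * Step, and the final step.add (vec.ind.next) is moved into the
// latch to feed the phi's back edge.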
2001 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2002     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
2003   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2004          "Expected either an induction phi-node or a truncate of it!");
2005   Value *Start = II.getStartValue();
2006 
2007   // Construct the initial value of the vector IV in the vector loop preheader
2008   auto CurrIP = Builder.saveIP();
2009   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2010   if (isa<TruncInst>(EntryVal)) {
2011     assert(Start->getType()->isIntegerTy() &&
2012            "Truncation requires an integer type");
2013     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2014     Step = Builder.CreateTrunc(Step, TruncType);
2015     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2016   }
2017   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2018   Value *SteppedStart =
2019       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2020 
2021   // We create vector phi nodes for both integer and floating-point induction
2022   // variables. Here, we determine the kind of arithmetic we will perform.
2023   Instruction::BinaryOps AddOp;
2024   Instruction::BinaryOps MulOp;
2025   if (Step->getType()->isIntegerTy()) {
2026     AddOp = Instruction::Add;
2027     MulOp = Instruction::Mul;
2028   } else {
2029     AddOp = II.getInductionOpcode();
2030     MulOp = Instruction::FMul;
2031   }
2032 
2033   // Multiply the vectorization factor by the step using integer or
2034   // floating-point arithmetic as appropriate.
2035   Value *ConstVF =
2036       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
2037   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
2038 
2039   // Create a vector splat to use in the induction update.
2040   //
2041   // FIXME: If the step is non-constant, we create the vector splat with
2042   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2043   //        handle a constant vector splat.
2044   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2045   Value *SplatVF = isa<Constant>(Mul)
2046                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2047                        : Builder.CreateVectorSplat(VF, Mul);
2048   Builder.restoreIP(CurrIP);
2049 
2050   // We may need to add the step a number of times, depending on the unroll
2051   // factor. The last of those goes into the PHI.
2052   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2053                                     &*LoopVectorBody->getFirstInsertionPt());
2054   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2055   Instruction *LastInduction = VecInd;
2056   for (unsigned Part = 0; Part < UF; ++Part) {
2057     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
2058 
2059     if (isa<TruncInst>(EntryVal))
2060       addMetadata(LastInduction, EntryVal);
2061     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
2062 
2063     LastInduction = cast<Instruction>(addFastMathFlag(
2064         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
2065     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2066   }
2067 
2068   // Move the last step to the end of the latch block. This ensures consistent
2069   // placement of all induction updates.
2070   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2071   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2072   auto *ICmp = cast<Instruction>(Br->getCondition());
2073   LastInduction->moveBefore(ICmp);
2074   LastInduction->setName("vec.ind.next");
2075 
2076   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2077   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2078 }
2079 
2080 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2081   return Cost->isScalarAfterVectorization(I, VF) ||
2082          Cost->isProfitableToScalarize(I, VF);
2083 }
2084 
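// An induction variable needs a scalar version if the IV itself will be
// scalarized, or if it has at least one user inside the loop that will be.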
2085 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2086   if (shouldScalarizeInstruction(IV))
2087     return true;
2088   auto isScalarInst = [&](User *U) -> bool {
2089     auto *I = cast<Instruction>(U);
2090     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2091   };
2092   return llvm::any_of(IV->users(), isScalarInst);
2093 }
2094 
2095 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2096     const InductionDescriptor &ID, const Instruction *EntryVal,
2097     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
2098   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2099          "Expected either an induction phi-node or a truncate of it!");
2100 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
2107   if (isa<TruncInst>(EntryVal))
2108     return;
2109 
2110   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2111   if (Casts.empty())
2112     return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
  // induction update chain itself.
2116   Instruction *CastInst = *Casts.begin();
2117   if (Lane < UINT_MAX)
2118     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
2119   else
2120     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
2121 }
2122 
2123 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
2124   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2125          "Primary induction variable must have an integer type");
2126 
2127   auto II = Legal->getInductionVars().find(IV);
2128   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2129 
2130   auto ID = II->second;
2131   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2132 
2133   // The value from the original loop to which we are mapping the new induction
2134   // variable.
2135   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2136 
2137   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2138 
  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
2141   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2142     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2143            "Induction step should be loop invariant");
2144     if (PSE.getSE()->isSCEVable(IV->getType())) {
2145       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2146       return Exp.expandCodeFor(Step, Step->getType(),
2147                                LoopVectorPreHeader->getTerminator());
2148     }
2149     return cast<SCEVUnknown>(Step)->getValue();
2150   };
2151 
2152   // The scalar value to broadcast. This is derived from the canonical
2153   // induction variable. If a truncation type is given, truncate the canonical
2154   // induction variable and step. Otherwise, derive these values from the
2155   // induction descriptor.
2156   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2157     Value *ScalarIV = Induction;
2158     if (IV != OldInduction) {
2159       ScalarIV = IV->getType()->isIntegerTy()
2160                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2161                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2162                                           IV->getType());
2163       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2164       ScalarIV->setName("offset.idx");
2165     }
2166     if (Trunc) {
2167       auto *TruncType = cast<IntegerType>(Trunc->getType());
2168       assert(Step->getType()->isIntegerTy() &&
2169              "Truncation requires an integer step");
2170       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2171       Step = Builder.CreateTrunc(Step, TruncType);
2172     }
2173     return ScalarIV;
2174   };
2175 
2176   // Create the vector values from the scalar IV, in the absence of creating a
2177   // vector IV.
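  // For unroll part P the resulting vector is
  //   broadcast(ScalarIV) + <P*VF, P*VF+1, ..., P*VF+VF-1> * Step.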
2178   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2179     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2180     for (unsigned Part = 0; Part < UF; ++Part) {
2181       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2182       Value *EntryPart =
2183           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2184                         ID.getInductionOpcode());
2185       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2186       if (Trunc)
2187         addMetadata(EntryPart, Trunc);
2188       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
2189     }
2190   };
2191 
2192   // Now do the actual transformations, and start with creating the step value.
2193   Value *Step = CreateStepValue(ID.getStep());
2194   if (VF.isZero() || VF.isScalar()) {
2195     Value *ScalarIV = CreateScalarIV(Step);
2196     CreateSplatIV(ScalarIV, Step);
2197     return;
2198   }
2199 
2200   // Determine if we want a scalar version of the induction variable. This is
2201   // true if the induction variable itself is not widened, or if it has at
2202   // least one user in the loop that is not widened.
2203   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2204   if (!NeedsScalarIV) {
2205     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2206     return;
2207   }
2208 
2209   // Try to create a new independent vector induction variable. If we can't
2210   // create the phi node, we will splat the scalar induction variable in each
2211   // loop iteration.
2212   if (!shouldScalarizeInstruction(EntryVal)) {
2213     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2214     Value *ScalarIV = CreateScalarIV(Step);
2215     // Create scalar steps that can be used by instructions we will later
2216     // scalarize. Note that the addition of the scalar steps will not increase
2217     // the number of instructions in the loop in the common case prior to
2218     // InstCombine. We will be trading one vector extract for each scalar step.
2219     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2220     return;
2221   }
2222 
2223   // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV. Except when we tail-fold, then the splat IV feeds the
2225   // predicate used by the masked loads/stores.
2226   Value *ScalarIV = CreateScalarIV(Step);
2227   if (!Cost->isScalarEpilogueAllowed())
2228     CreateSplatIV(ScalarIV, Step);
2229   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2230 }
2231 
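// Produce a vector whose lane L is BinOp(Val[L], (StartIdx + L) * Step).
// Integer inductions use add/mul; FP inductions use the provided binary
// opcode and mark the arithmetic as fast-math.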
2232 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2233                                           Instruction::BinaryOps BinOp) {
2234   // Create and check the types.
2235   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2236   int VLen = ValVTy->getNumElements();
2237 
2238   Type *STy = Val->getType()->getScalarType();
2239   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2240          "Induction Step must be an integer or FP");
2241   assert(Step->getType() == STy && "Step has wrong type");
2242 
2243   SmallVector<Constant *, 8> Indices;
2244 
2245   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers starting at StartIdx.
2247     for (int i = 0; i < VLen; ++i)
2248       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2249 
2250     // Add the consecutive indices to the vector value.
2251     Constant *Cv = ConstantVector::get(Indices);
2252     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2253     Step = Builder.CreateVectorSplat(VLen, Step);
2254     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    //        flags, which can be found from the original scalar operations.
2257     Step = Builder.CreateMul(Cv, Step);
2258     return Builder.CreateAdd(Val, Step, "induction");
2259   }
2260 
2261   // Floating point induction.
2262   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2263          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers starting at StartIdx.
2265   for (int i = 0; i < VLen; ++i)
2266     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2267 
2268   // Add the consecutive indices to the vector value.
2269   Constant *Cv = ConstantVector::get(Indices);
2270 
2271   Step = Builder.CreateVectorSplat(VLen, Step);
2272 
2273   // Floating point operations had to be 'fast' to enable the induction.
2274   FastMathFlags Flags;
2275   Flags.setFast();
2276 
2277   Value *MulOp = Builder.CreateFMul(Cv, Step);
2278   if (isa<Instruction>(MulOp))
2279     // Have to check, MulOp may be a constant
2280     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2281 
2282   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2283   if (isa<Instruction>(BOp))
2284     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2285   return BOp;
2286 }
2287 
2288 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2289                                            Instruction *EntryVal,
2290                                            const InductionDescriptor &ID) {
2291   // We shouldn't have to build scalar steps if we aren't vectorizing.
2292   assert(VF.isVector() && "VF should be greater than one");
2293   // Get the value type and ensure it and the step have the same integer type.
2294   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2295   assert(ScalarIVTy == Step->getType() &&
2296          "Val and Step should have the same type");
2297 
2298   // We build scalar steps for both integer and floating-point induction
2299   // variables. Here, we determine the kind of arithmetic we will perform.
2300   Instruction::BinaryOps AddOp;
2301   Instruction::BinaryOps MulOp;
2302   if (ScalarIVTy->isIntegerTy()) {
2303     AddOp = Instruction::Add;
2304     MulOp = Instruction::Mul;
2305   } else {
2306     AddOp = ID.getInductionOpcode();
2307     MulOp = Instruction::FMul;
2308   }
2309 
2310   // Determine the number of scalars we need to generate for each unroll
2311   // iteration. If EntryVal is uniform, we only need to generate the first
2312   // lane. Otherwise, we generate all VF values.
2313   unsigned Lanes =
2314       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2315           ? 1
2316           : VF.getKnownMinValue();
2317   assert((!VF.isScalable() || Lanes == 1) &&
2318          "Should never scalarize a scalable vector");
2319   // Compute the scalar steps and save the results in VectorLoopValueMap.
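  // For part P and lane L the generated scalar step is
  //   ScalarIV + (P * VF + L) * Step,
  // computed with integer or FP arithmetic to match the induction type.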
2320   for (unsigned Part = 0; Part < UF; ++Part) {
2321     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2322       auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2323                                          ScalarIVTy->getScalarSizeInBits());
2324       Value *StartIdx =
2325           createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2326       if (ScalarIVTy->isFloatingPointTy())
2327         StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
2328       StartIdx = addFastMathFlag(Builder.CreateBinOp(
2329           AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
2330       // The step returned by `createStepForVF` is a runtime-evaluated value
2331       // when VF is scalable. Otherwise, it should be folded into a Constant.
2332       assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2333              "Expected StartIdx to be folded to a constant when VF is not "
2334              "scalable");
2335       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2336       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2337       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2338       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2339     }
2340   }
2341 }
2342 
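// Return the vector value corresponding to \p V for unroll part \p Part,
// creating it on demand: reuse an already-vectorized value, pack previously
// scalarized values into a vector, or broadcast a loop-invariant value.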
2343 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2344   assert(V != Induction && "The new induction variable should not be used.");
2345   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2346   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2347 
2348   // If we have a stride that is replaced by one, do it here. Defer this for
2349   // the VPlan-native path until we start running Legal checks in that path.
2350   if (!EnableVPlanNativePath && Legal->hasStride(V))
2351     V = ConstantInt::get(V->getType(), 1);
2352 
2353   // If we have a vector mapped to this value, return it.
2354   if (VectorLoopValueMap.hasVectorValue(V, Part))
2355     return VectorLoopValueMap.getVectorValue(V, Part);
2356 
2357   // If the value has not been vectorized, check if it has been scalarized
2358   // instead. If it has been scalarized, and we actually need the value in
2359   // vector form, we will construct the vector values on demand.
2360   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2361     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2362 
2363     // If we've scalarized a value, that value should be an instruction.
2364     auto *I = cast<Instruction>(V);
2365 
2366     // If we aren't vectorizing, we can just copy the scalar map values over to
2367     // the vector map.
2368     if (VF.isScalar()) {
2369       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2370       return ScalarValue;
2371     }
2372 
2373     // Get the last scalar instruction we generated for V and Part. If the value
2374     // is known to be uniform after vectorization, this corresponds to lane zero
2375     // of the Part unroll iteration. Otherwise, the last instruction is the one
2376     // we created for the last vector lane of the Part unroll iteration.
2377     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2378                             ? 0
2379                             : VF.getKnownMinValue() - 1;
2380     assert((!VF.isScalable() || LastLane == 0) &&
2381            "Scalable vectorization can't lead to any scalarized values.");
2382     auto *LastInst = cast<Instruction>(
2383         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2384 
2385     // Set the insert point after the last scalarized instruction. This ensures
2386     // the insertelement sequence will directly follow the scalar definitions.
2387     auto OldIP = Builder.saveIP();
2388     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2389     Builder.SetInsertPoint(&*NewIP);
2390 
2391     // However, if we are vectorizing, we need to construct the vector values.
2392     // If the value is known to be uniform after vectorization, we can just
2393     // broadcast the scalar value corresponding to lane zero for each unroll
2394     // iteration. Otherwise, we construct the vector values using insertelement
2395     // instructions. Since the resulting vectors are stored in
2396     // VectorLoopValueMap, we will only generate the insertelements once.
2397     Value *VectorValue = nullptr;
2398     if (Cost->isUniformAfterVectorization(I, VF)) {
2399       VectorValue = getBroadcastInstrs(ScalarValue);
2400       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2401     } else {
2402       // Initialize packing with insertelements to start from undef.
2403       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2404       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2405       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2406       for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2407         packScalarIntoVectorValue(V, {Part, Lane});
2408       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2409     }
2410     Builder.restoreIP(OldIP);
2411     return VectorValue;
2412   }
2413 
2414   // If this scalar is unknown, assume that it is a constant or that it is
2415   // loop invariant. Broadcast V and save the value for future uses.
2416   Value *B = getBroadcastInstrs(V);
2417   VectorLoopValueMap.setVectorValue(V, Part, B);
2418   return B;
2419 }
2420 
2421 Value *
2422 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2423                                             const VPIteration &Instance) {
2424   // If the value is not an instruction contained in the loop, it should
2425   // already be scalar.
2426   if (OrigLoop->isLoopInvariant(V))
2427     return V;
2428 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2432 
2433   // If the value from the original loop has not been vectorized, it is
2434   // represented by UF x VF scalar values in the new loop. Return the requested
2435   // scalar value.
2436   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2437     return VectorLoopValueMap.getScalarValue(V, Instance);
2438 
2439   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2440   // for the given unroll part. If this entry is not a vector type (i.e., the
2441   // vectorization factor is one), there is no need to generate an
2442   // extractelement instruction.
2443   auto *U = getOrCreateVectorValue(V, Instance.Part);
2444   if (!U->getType()->isVectorTy()) {
2445     assert(VF.isScalar() && "Value not scalarized has non-vector type");
2446     return U;
2447   }
2448 
2449   // Otherwise, the value from the original loop has been vectorized and is
2450   // represented by UF vector values. Extract and return the requested scalar
2451   // value from the appropriate vector lane.
2452   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2453 }
2454 
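// Insert the scalar value generated for \p Instance into the corresponding
// lane of the vector value cached for \p V, and update the cached entry.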
2455 void InnerLoopVectorizer::packScalarIntoVectorValue(
2456     Value *V, const VPIteration &Instance) {
2457   assert(V != Induction && "The new induction variable should not be used.");
2458   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2459   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2460 
2461   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2462   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2463   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2464                                             Builder.getInt32(Instance.Lane));
2465   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2466 }
2467 
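// Create a vector whose lanes are those of \p Vec in reverse order; e.g. for
// VF = 4 the generated shuffle mask is <3, 2, 1, 0>.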
2468 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2469   assert(Vec->getType()->isVectorTy() && "Invalid type");
2470   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
2471   SmallVector<int, 8> ShuffleMask;
2472   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2473     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2474 
2475   return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2476 }
2477 
2478 // Return whether we allow using masked interleave-groups (for dealing with
2479 // strided loads/stores that reside in predicated blocks, or for dealing
2480 // with gaps).
2481 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2482   // If an override option has been passed in for interleaved accesses, use it.
2483   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2484     return EnableMaskedInterleavedMemAccesses;
2485 
2486   return TTI.enableMaskedInterleavedAccessVectorization();
2487 }
2488 
2489 // Try to vectorize the interleave group that \p Instr belongs to.
2490 //
// E.g. Translate the following interleaved load group (factor = 3):
2492 //   for (i = 0; i < N; i+=3) {
2493 //     R = Pic[i];             // Member of index 0
2494 //     G = Pic[i+1];           // Member of index 1
2495 //     B = Pic[i+2];           // Member of index 2
2496 //     ... // do something to R, G, B
2497 //   }
2498 // To:
2499 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2500 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2501 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2502 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2503 //
// Or translate the following interleaved store group (factor = 3):
2505 //   for (i = 0; i < N; i+=3) {
2506 //     ... do something to R, G, B
2507 //     Pic[i]   = R;           // Member of index 0
2508 //     Pic[i+1] = G;           // Member of index 1
2509 //     Pic[i+2] = B;           // Member of index 2
2510 //   }
2511 // To:
2512 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2513 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2514 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2515 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2516 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2517 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2518     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2519     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2520     VPValue *BlockInMask) {
2521   Instruction *Instr = Group->getInsertPos();
2522   const DataLayout &DL = Instr->getModule()->getDataLayout();
2523 
2524   // Prepare for the vector type of the interleaved load/store.
2525   Type *ScalarTy = getMemInstValueType(Instr);
2526   unsigned InterleaveFactor = Group->getFactor();
2527   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2528   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2529 
2530   // Prepare for the new pointers.
2531   SmallVector<Value *, 2> AddrParts;
2532   unsigned Index = Group->getIndex(Instr);
2533 
2534   // TODO: extend the masked interleaved-group support to reversed access.
2535   assert((!BlockInMask || !Group->isReverse()) &&
2536          "Reversed masked interleave-group not supported.");
2537 
2538   // If the group is reverse, adjust the index to refer to the last vector lane
2539   // instead of the first. We adjust the index from the first vector lane,
2540   // rather than directly getting the pointer for lane VF - 1, because the
2541   // pointer operand of the interleaved access is supposed to be uniform. For
2542   // uniform instructions, we're only required to generate a value for the
2543   // first vector lane in each unroll iteration.
2544   assert(!VF.isScalable() &&
2545          "scalable vector reverse operation is not implemented");
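  // For example, with factor 2 and VF 4 the adjustment adds (4 - 1) * 2 = 6,
  // so the negative GEP offset below lands on the member-0 element of the
  // last lane's tuple.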
2546   if (Group->isReverse())
2547     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2548 
2549   for (unsigned Part = 0; Part < UF; Part++) {
2550     Value *AddrPart = State.get(Addr, {Part, 0});
2551     setDebugLocFromInst(Builder, AddrPart);
2552 
    // Note that the current instruction could be at any index in the group.
    // We need to adjust the address so it points at the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2564 
2565     bool InBounds = false;
2566     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2567       InBounds = gep->isInBounds();
2568     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2569     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2570 
2571     // Cast to the vector pointer type.
2572     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2573     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2574     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2575   }
2576 
2577   setDebugLocFromInst(Builder, Instr);
2578   Value *UndefVec = UndefValue::get(VecTy);
2579 
2580   Value *MaskForGaps = nullptr;
2581   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2582     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2583     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2584     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2585   }
2586 
2587   // Vectorize the interleaved load group.
2588   if (isa<LoadInst>(Instr)) {
2589     // For each unroll part, create a wide load for the group.
2590     SmallVector<Value *, 2> NewLoads;
2591     for (unsigned Part = 0; Part < UF; Part++) {
2592       Instruction *NewLoad;
2593       if (BlockInMask || MaskForGaps) {
2594         assert(useMaskedInterleavedAccesses(*TTI) &&
2595                "masked interleaved groups are not allowed.");
2596         Value *GroupMask = MaskForGaps;
2597         if (BlockInMask) {
2598           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2599           assert(!VF.isScalable() && "scalable vectors not yet supported.");
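          // Replicate each lane's mask bit across all members of its tuple;
          // e.g. for factor 3 and VF 4 the replicated mask indices are
          // <0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3>.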
2600           Value *ShuffledMask = Builder.CreateShuffleVector(
2601               BlockInMaskPart,
2602               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2603               "interleaved.mask");
2604           GroupMask = MaskForGaps
2605                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2606                                                 MaskForGaps)
2607                           : ShuffledMask;
2608         }
2609         NewLoad =
2610             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2611                                      GroupMask, UndefVec, "wide.masked.vec");
      } else
2614         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2615                                             Group->getAlign(), "wide.vec");
2616       Group->addMetadata(NewLoad);
2617       NewLoads.push_back(NewLoad);
2618     }
2619 
2620     // For each member in the group, shuffle out the appropriate data from the
2621     // wide loads.
2622     unsigned J = 0;
2623     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2624       Instruction *Member = Group->getMember(I);
2625 
2626       // Skip the gaps in the group.
2627       if (!Member)
2628         continue;
2629 
2630       assert(!VF.isScalable() && "scalable vectors not yet supported.");
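      // The stride mask selects this member's element from each tuple of the
      // wide load; e.g. member 1 with factor 3 and VF 4 uses <1, 4, 7, 10>.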
2631       auto StrideMask =
2632           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2633       for (unsigned Part = 0; Part < UF; Part++) {
2634         Value *StridedVec = Builder.CreateShuffleVector(
2635             NewLoads[Part], StrideMask, "strided.vec");
2636 
        // If this member has a different type, cast the result to that type.
2638         if (Member->getType() != ScalarTy) {
2639           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2640           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2641           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2642         }
2643 
2644         if (Group->isReverse())
2645           StridedVec = reverseVector(StridedVec);
2646 
2647         State.set(VPDefs[J], Member, StridedVec, Part);
2648       }
2649       ++J;
2650     }
2651     return;
2652   }
2653 
2654   // The sub vector type for current instruction.
2655   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2656   auto *SubVT = VectorType::get(ScalarTy, VF);
2657 
2658   // Vectorize the interleaved store group.
2659   for (unsigned Part = 0; Part < UF; Part++) {
2660     // Collect the stored vector from each member.
2661     SmallVector<Value *, 4> StoredVecs;
2662     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index must
      // have a member.
      assert(Group->getMember(i) &&
             "Failed to get a member from an interleaved store group");
2665 
2666       Value *StoredVec = State.get(StoredValues[i], Part);
2667 
2668       if (Group->isReverse())
2669         StoredVec = reverseVector(StoredVec);
2670 
      // If this member has a different type, cast it to the unified type.
      if (StoredVec->getType() != SubVT)
2674         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2675 
2676       StoredVecs.push_back(StoredVec);
2677     }
2678 
2679     // Concatenate all vectors into a wide vector.
2680     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2681 
2682     // Interleave the elements in the wide vector.
2683     assert(!VF.isScalable() && "scalable vectors not yet supported.");
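    // The interleave mask reassembles the tuples from the concatenated member
    // vectors; e.g. for VF 4 and factor 3 it is
    // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>.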
2684     Value *IVec = Builder.CreateShuffleVector(
2685         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2686         "interleaved.vec");
2687 
2688     Instruction *NewStoreInstr;
2689     if (BlockInMask) {
2690       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2691       Value *ShuffledMask = Builder.CreateShuffleVector(
2692           BlockInMaskPart,
2693           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2694           "interleaved.mask");
2695       NewStoreInstr = Builder.CreateMaskedStore(
2696           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    } else
2699       NewStoreInstr =
2700           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2701 
2702     Group->addMetadata(NewStoreInstr);
2703   }
2704 }
2705 
2706 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2707     Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2708     VPValue *StoredValue, VPValue *BlockInMask) {
2709   // Attempt to issue a wide load.
2710   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2711   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2712 
2713   assert((LI || SI) && "Invalid Load/Store instruction");
2714   assert((!SI || StoredValue) && "No stored value provided for widened store");
2715   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2716 
2717   LoopVectorizationCostModel::InstWidening Decision =
2718       Cost->getWideningDecision(Instr, VF);
2719   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2720           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2721           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2722          "CM decision is not to widen the memory instruction");
2723 
2724   Type *ScalarDataTy = getMemInstValueType(Instr);
2725 
2726   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2727   const Align Alignment = getLoadStoreAlignment(Instr);
2728 
2729   // Determine if the pointer operand of the access is either consecutive or
2730   // reverse consecutive.
2731   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2732   bool ConsecutiveStride =
2733       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2734   bool CreateGatherScatter =
2735       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2736 
2737   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2738   // gather/scatter. Otherwise Decision should have been to Scalarize.
2739   assert((ConsecutiveStride || CreateGatherScatter) &&
2740          "The instruction should be scalarized");
2741   (void)ConsecutiveStride;
2742 
2743   VectorParts BlockInMaskParts(UF);
2744   bool isMaskRequired = BlockInMask;
2745   if (isMaskRequired)
2746     for (unsigned Part = 0; Part < UF; ++Part)
2747       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2748 
2749   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2750     // Calculate the pointer for the specific unroll-part.
2751     GetElementPtrInst *PartPtr = nullptr;
2752 
2753     bool InBounds = false;
2754     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2755       InBounds = gep->isInBounds();
2756 
2757     if (Reverse) {
2758       assert(!VF.isScalable() &&
2759              "Reversing vectors is not yet supported for scalable vectors.");
2760 
2761       // If the address is consecutive but reversed, then the
2762       // wide store needs to start at the last vector element.
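      // For example, with VF 4 the pointer for part Part is moved back by
      // Part * 4 elements and then by a further 3 elements, so the wide
      // access covers the 4 consecutive elements ending at Ptr - Part * 4.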
2763       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2764           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2765       PartPtr->setIsInBounds(InBounds);
2766       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2767           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2768       PartPtr->setIsInBounds(InBounds);
2769       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2770         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2771     } else {
2772       Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2773       PartPtr = cast<GetElementPtrInst>(
2774           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2775       PartPtr->setIsInBounds(InBounds);
2776     }
2777 
2778     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2779     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2780   };
2781 
2782   // Handle Stores:
2783   if (SI) {
2784     setDebugLocFromInst(Builder, SI);
2785 
2786     for (unsigned Part = 0; Part < UF; ++Part) {
2787       Instruction *NewSI = nullptr;
2788       Value *StoredVal = State.get(StoredValue, Part);
2789       if (CreateGatherScatter) {
2790         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2791         Value *VectorGep = State.get(Addr, Part);
2792         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2793                                             MaskPart);
2794       } else {
2795         if (Reverse) {
2796           // If we store to reverse consecutive memory locations, then we need
2797           // to reverse the order of elements in the stored value.
2798           StoredVal = reverseVector(StoredVal);
2799           // We don't want to update the value in the map as it might be used in
2800           // another expression. So don't call resetVectorValue(StoredVal).
2801         }
2802         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2803         if (isMaskRequired)
2804           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2805                                             BlockInMaskParts[Part]);
2806         else
2807           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2808       }
2809       addMetadata(NewSI, SI);
2810     }
2811     return;
2812   }
2813 
2814   // Handle loads.
2815   assert(LI && "Must have a load instruction");
2816   setDebugLocFromInst(Builder, LI);
2817   for (unsigned Part = 0; Part < UF; ++Part) {
2818     Value *NewLI;
2819     if (CreateGatherScatter) {
2820       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2821       Value *VectorGep = State.get(Addr, Part);
2822       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2823                                          nullptr, "wide.masked.gather");
2824       addMetadata(NewLI, LI);
2825     } else {
2826       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2827       if (isMaskRequired)
2828         NewLI = Builder.CreateMaskedLoad(
2829             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2830             "wide.masked.load");
2831       else
2832         NewLI =
2833             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2834 
2835       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2836       addMetadata(NewLI, LI);
2837       if (Reverse)
2838         NewLI = reverseVector(NewLI);
2839     }
2840 
2841     State.set(Def, Instr, NewLI, Part);
2842   }
2843 }
2844 
2845 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2846                                                const VPIteration &Instance,
2847                                                bool IfPredicateInstr,
2848                                                VPTransformState &State) {
2849   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2850 
2851   setDebugLocFromInst(Builder, Instr);
2852 
  // Does this instruction return a value?
2854   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2855 
2856   Instruction *Cloned = Instr->clone();
2857   if (!IsVoidRetTy)
2858     Cloned->setName(Instr->getName() + ".cloned");
2859 
2860   // Replace the operands of the cloned instructions with their scalar
2861   // equivalents in the new loop.
2862   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2863     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2864     auto InputInstance = Instance;
2865     if (!Operand || !OrigLoop->contains(Operand) ||
2866         (Cost->isUniformAfterVectorization(Operand, State.VF)))
2867       InputInstance.Lane = 0;
2868     auto *NewOp = State.get(User.getOperand(op), InputInstance);
2869     Cloned->setOperand(op, NewOp);
2870   }
2871   addNewMetadata(Cloned, Instr);
2872 
2873   // Place the cloned scalar in the new loop.
2874   Builder.Insert(Cloned);
2875 
  // TODO: Set result for VPValue of VPReplicateRecipe. This requires
2877   // representing scalar values in VPTransformState. Add the cloned scalar to
2878   // the scalar map entry.
2879   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2880 
2881   // If we just cloned a new assumption, add it the assumption cache.
2882   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2883     if (II->getIntrinsicID() == Intrinsic::assume)
2884       AC->registerAssumption(II);
2885 
2886   // End if-block.
2887   if (IfPredicateInstr)
2888     PredicatedInstructions.push_back(Cloned);
2889 }
2890 
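// Create the canonical induction variable of the new (vector) loop: an
// integer phi in the header that starts at \p Start, is incremented by
// \p Step in the latch, and exits the loop once the incremented value
// equals \p End.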
2891 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2892                                                       Value *End, Value *Step,
2893                                                       Instruction *DL) {
2894   BasicBlock *Header = L->getHeader();
2895   BasicBlock *Latch = L->getLoopLatch();
2896   // As we're just creating this loop, it's possible no latch exists
2897   // yet. If so, use the header as this will be a single block loop.
2898   if (!Latch)
2899     Latch = Header;
2900 
2901   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2902   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2903   setDebugLocFromInst(Builder, OldInst);
2904   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2905 
2906   Builder.SetInsertPoint(Latch->getTerminator());
2907   setDebugLocFromInst(Builder, OldInst);
2908 
2909   // Create i+1 and fill the PHINode.
2910   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2911   Induction->addIncoming(Start, L->getLoopPreheader());
2912   Induction->addIncoming(Next, Latch);
2913   // Create the compare.
2914   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2915   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2916 
2917   // Now we have two terminators. Remove the old one from the block.
2918   Latch->getTerminator()->eraseFromParent();
2919 
2920   return Induction;
2921 }
2922 
2923 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2924   if (TripCount)
2925     return TripCount;
2926 
2927   assert(L && "Create Trip Count for null loop.");
2928   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2929   // Find the loop boundaries.
2930   ScalarEvolution *SE = PSE.getSE();
2931   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2932   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2933          "Invalid loop count");
2934 
2935   Type *IdxTy = Legal->getWidestInductionType();
2936   assert(IdxTy && "No type for induction");
2937 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign extended before the
  // compare. The only way we get a backedge-taken count in that case is if
  // the induction variable was signed and as such will not overflow; the
  // truncation is therefore legal.
2943   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2944       IdxTy->getPrimitiveSizeInBits())
2945     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2946   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2947 
2948   // Get the total trip count from the count by adding 1.
2949   const SCEV *ExitCount = SE->getAddExpr(
2950       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2951 
2952   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2953 
2954   // Expand the trip count and place the new instructions in the preheader.
2955   // Notice that the pre-header does not change, only the loop body.
2956   SCEVExpander Exp(*SE, DL, "induction");
2957 
2958   // Count holds the overall loop count (N).
2959   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2960                                 L->getLoopPreheader()->getTerminator());
2961 
2962   if (TripCount->getType()->isPointerTy())
2963     TripCount =
2964         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2965                                     L->getLoopPreheader()->getTerminator());
2966 
2967   return TripCount;
2968 }
2969 
2970 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2971   if (VectorTripCount)
2972     return VectorTripCount;
2973 
2974   Value *TC = getOrCreateTripCount(L);
2975   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2976 
2977   Type *Ty = TC->getType();
2978   // This is where we can make the step a runtime constant.
2979   Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
2980 
2981   // If the tail is to be folded by masking, round the number of iterations N
2982   // up to a multiple of Step instead of rounding down. This is done by first
2983   // adding Step-1 and then rounding down. Note that it's ok if this addition
2984   // overflows: the vector induction variable will eventually wrap to zero given
2985   // that it starts at zero and its Step is a power of two; the loop will then
2986   // exit, with the last early-exit vector comparison also producing all-true.
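  // For example, with a trip count of 10, VF = 4 and UF = 1, adding Step - 1
  // gives 13, and the computation below yields n.vec = 13 - (13 % 4) = 12,
  // i.e. three masked vector iterations covering all 10 scalar iterations.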
2987   if (Cost->foldTailByMasking()) {
2988     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2989            "VF*UF must be a power of 2 when folding tail by masking");
2990     assert(!VF.isScalable() &&
2991            "Tail folding not yet supported for scalable vectors");
2992     TC = Builder.CreateAdd(
2993         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
2994   }
2995 
2996   // Now we need to generate the expression for the part of the loop that the
2997   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2998   // iterations are not required for correctness, or N - Step, otherwise. Step
2999   // is equal to the vectorization factor (number of SIMD elements) times the
3000   // unroll factor (number of SIMD instructions).
3001   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3002 
3003   // If there is a non-reversed interleaved group that may speculatively access
3004   // memory out-of-bounds, we need to ensure that there will be at least one
3005   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
3006   // the trip count, we set the remainder to be equal to the step. If the step
3007   // does not evenly divide the trip count, no adjustment is necessary since
3008   // there will already be scalar iterations. Note that the minimum iterations
3009   // check ensures that N >= Step.
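  // For example, with a trip count of 8, VF = 4 and UF = 1, R would be 0; we
  // set it to 4 so that n.vec = 4 and the last four iterations are left for
  // the scalar epilogue loop.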
3010   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
3011     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3012     R = Builder.CreateSelect(IsZero, Step, R);
3013   }
3014 
3015   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3016 
3017   return VectorTripCount;
3018 }
3019 
3020 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3021                                                    const DataLayout &DL) {
3022   // Verify that V is a vector type with same number of elements as DstVTy.
3023   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3024   unsigned VF = DstFVTy->getNumElements();
3025   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
3027   Type *SrcElemTy = SrcVecTy->getElementType();
3028   Type *DstElemTy = DstFVTy->getElementType();
3029   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3030          "Vector elements must have same size");
3031 
3032   // Do a direct cast if element types are castable.
3033   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3034     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3035   }
  // V cannot be directly cast to the desired vector type. This may happen
  // when V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle it with a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
3040   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3041          "Only one type should be a pointer type");
3042   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3043          "Only one type should be a floating point type");
3044   Type *IntTy =
3045       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3046   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3047   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3048   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3049 }
3050 
3051 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3052                                                          BasicBlock *Bypass) {
3053   Value *Count = getOrCreateTripCount(L);
3054   // Reuse existing vector loop preheader for TC checks.
3055   // Note that new preheader block is generated for vector loop.
3056   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3057   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3058 
3059   // Generate code to check if the loop's trip count is less than VF * UF, or
3060   // equal to it in case a scalar epilogue is required; this implies that the
3061   // vector trip count is zero. This check also covers the case where adding one
3062   // to the backedge-taken count overflowed leading to an incorrect trip count
3063   // of zero. In this case we will also jump to the scalar loop.
3064   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3065                                           : ICmpInst::ICMP_ULT;
3066 
3067   // If tail is to be folded, vector loop takes care of all iterations.
3068   Value *CheckMinIters = Builder.getFalse();
3069   if (!Cost->foldTailByMasking()) {
3070     Value *Step =
3071         createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3072     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3073   }
3074   // Create new preheader for vector loop.
3075   LoopVectorPreHeader =
3076       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3077                  "vector.ph");
3078 
3079   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3080                                DT->getNode(Bypass)->getIDom()) &&
3081          "TC check is expected to dominate Bypass");
3082 
3083   // Update dominator for Bypass & LoopExit.
3084   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3085   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3086 
3087   ReplaceInstWithInst(
3088       TCCheckBlock->getTerminator(),
3089       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3090   LoopBypassBlocks.push_back(TCCheckBlock);
3091 }
3092 
3093 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3094   // Reuse existing vector loop preheader for SCEV checks.
3095   // Note that new preheader block is generated for vector loop.
3096   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
3097 
  // Generate the code to check the SCEV assumptions that we made.
3099   // We want the new basic block to start at the first instruction in a
3100   // sequence of instructions that form a check.
3101   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
3102                    "scev.check");
3103   Value *SCEVCheck = Exp.expandCodeForPredicate(
3104       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
3105 
3106   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
3107     if (C->isZero())
3108       return;
3109 
3110   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3111            (OptForSizeBasedOnProfile &&
3112             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3113          "Cannot SCEV check stride or overflow when optimizing for size");
3114 
3115   SCEVCheckBlock->setName("vector.scevcheck");
3116   // Create new preheader for vector loop.
3117   LoopVectorPreHeader =
3118       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
3119                  nullptr, "vector.ph");
3120 
3121   // Update dominator only if this is first RT check.
3122   if (LoopBypassBlocks.empty()) {
3123     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3124     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3125   }
3126 
3127   ReplaceInstWithInst(
3128       SCEVCheckBlock->getTerminator(),
3129       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
3130   LoopBypassBlocks.push_back(SCEVCheckBlock);
3131   AddedSafetyChecks = true;
3132 }
3133 
3134 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3135   // VPlan-native path does not do any analysis for runtime checks currently.
3136   if (EnableVPlanNativePath)
3137     return;
3138 
3139   // Reuse existing vector loop preheader for runtime memory checks.
3140   // Note that new preheader block is generated for vector loop.
3141   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
3142 
3143   // Generate the code that checks in runtime if arrays overlap. We put the
3144   // checks into a separate block to make the more common case of few elements
3145   // faster.
3146   auto *LAI = Legal->getLAI();
3147   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
3148   if (!RtPtrChecking.Need)
3149     return;
3150 
3151   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3152     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3153            "Cannot emit memory checks when optimizing for size, unless forced "
3154            "to vectorize.");
3155     ORE->emit([&]() {
3156       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3157                                         L->getStartLoc(), L->getHeader())
3158              << "Code-size may be reduced by not forcing "
3159                 "vectorization, or by source-code modifications "
3160                 "eliminating the need for runtime checks "
3161                 "(e.g., adding 'restrict').";
3162     });
3163   }
3164 
3165   MemCheckBlock->setName("vector.memcheck");
3166   // Create new preheader for vector loop.
3167   LoopVectorPreHeader =
3168       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
3169                  "vector.ph");
3170 
3171   auto *CondBranch = cast<BranchInst>(
3172       Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
3173   ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
  // Update dominator only if this is the first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, MemCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
  }

  LoopBypassBlocks.push_back(MemCheckBlock);
  AddedSafetyChecks = true;
3182 
3183   Instruction *FirstCheckInst;
3184   Instruction *MemRuntimeCheck;
3185   std::tie(FirstCheckInst, MemRuntimeCheck) =
3186       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
3187                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
3188   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
3189                             "claimed checks are required");
3190   CondBranch->setCondition(MemRuntimeCheck);
3191 
3192   // We currently don't use LoopVersioning for the actual loop cloning but we
3193   // still use it to add the noalias metadata.
3194   LVer = std::make_unique<LoopVersioning>(
3195       *Legal->getLAI(),
3196       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3197       DT, PSE.getSE());
3198   LVer->prepareNoAliasMetadata();
3199 }
3200 
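// Compute the value the induction described by \p ID takes after \p Index
// steps from its start value: StartValue + Index * Step for integer
// inductions, and the corresponding GEP or FP arithmetic for pointer and
// floating-point inductions.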
3201 Value *InnerLoopVectorizer::emitTransformedIndex(
3202     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3203     const InductionDescriptor &ID) const {
3204 
3205   SCEVExpander Exp(*SE, DL, "induction");
3206   auto Step = ID.getStep();
3207   auto StartValue = ID.getStartValue();
3208   assert(Index->getType() == Step->getType() &&
3209          "Index type does not match StepValue type");
3210 
3211   // Note: the IR at this point is broken. We cannot use SE to create any new
3212   // SCEV and then expand it, hoping that SCEV's simplification will give us
3213   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
3214   // lead to various SCEV crashes. So all we can do is to use builder and rely
3215   // on InstCombine for future simplifications. Here we handle some trivial
3216   // cases only.
3217   auto CreateAdd = [&B](Value *X, Value *Y) {
3218     assert(X->getType() == Y->getType() && "Types don't match!");
3219     if (auto *CX = dyn_cast<ConstantInt>(X))
3220       if (CX->isZero())
3221         return Y;
3222     if (auto *CY = dyn_cast<ConstantInt>(Y))
3223       if (CY->isZero())
3224         return X;
3225     return B.CreateAdd(X, Y);
3226   };
3227 
3228   auto CreateMul = [&B](Value *X, Value *Y) {
3229     assert(X->getType() == Y->getType() && "Types don't match!");
3230     if (auto *CX = dyn_cast<ConstantInt>(X))
3231       if (CX->isOne())
3232         return Y;
3233     if (auto *CY = dyn_cast<ConstantInt>(Y))
3234       if (CY->isOne())
3235         return X;
3236     return B.CreateMul(X, Y);
3237   };
3238 
3239   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3240   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3241   // the DomTree is not kept up-to-date for additional blocks generated in the
3242   // vector loop. By using the header as insertion point, we guarantee that the
3243   // expanded instructions dominate all their uses.
3244   auto GetInsertPoint = [this, &B]() {
3245     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3246     if (InsertBB != LoopVectorBody &&
3247         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3248       return LoopVectorBody->getTerminator();
3249     return &*B.GetInsertPoint();
3250   };
3251   switch (ID.getKind()) {
3252   case InductionDescriptor::IK_IntInduction: {
3253     assert(Index->getType() == StartValue->getType() &&
3254            "Index type does not match StartValue type");
3255     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3256       return B.CreateSub(StartValue, Index);
3257     auto *Offset = CreateMul(
3258         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3259     return CreateAdd(StartValue, Offset);
3260   }
3261   case InductionDescriptor::IK_PtrInduction: {
3262     assert(isa<SCEVConstant>(Step) &&
3263            "Expected constant step for pointer induction");
3264     return B.CreateGEP(
3265         StartValue->getType()->getPointerElementType(), StartValue,
3266         CreateMul(Index,
3267                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3268   }
3269   case InductionDescriptor::IK_FpInduction: {
3270     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3271     auto InductionBinOp = ID.getInductionBinOp();
3272     assert(InductionBinOp &&
3273            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3274             InductionBinOp->getOpcode() == Instruction::FSub) &&
3275            "Original bin op should be defined for FP induction");
3276 
3277     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3278 
3279     // Floating point operations had to be 'fast' to enable the induction.
3280     FastMathFlags Flags;
3281     Flags.setFast();
3282 
3283     Value *MulExp = B.CreateFMul(StepValue, Index);
3284     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be folded to a constant.
3286       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3287 
3288     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3289                                "induction");
3290     if (isa<Instruction>(BOp))
3291       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3292 
3293     return BOp;
3294   }
3295   case InductionDescriptor::IK_NoInduction:
3296     return nullptr;
3297   }
3298   llvm_unreachable("invalid enum");
3299 }
3300 
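// Split the original preheader to create the middle block, the scalar
// preheader and the (still empty) vector body, update the dominator for the
// loop exit, and register the new vector loop with LoopInfo.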
3301 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3302   LoopScalarBody = OrigLoop->getHeader();
3303   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3304   LoopExitBlock = OrigLoop->getExitBlock();
3305   assert(LoopExitBlock && "Must have an exit block");
3306   assert(LoopVectorPreHeader && "Invalid loop structure");
3307 
3308   LoopMiddleBlock =
3309       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3310                  LI, nullptr, Twine(Prefix) + "middle.block");
3311   LoopScalarPreHeader =
3312       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3313                  nullptr, Twine(Prefix) + "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo since LoopVectorBody
  // should belong to a different loop than LoopVectorPreHeader. LoopVectorBody
  // is explicitly added to the correct loop a few lines below.
3317   LoopVectorBody =
3318       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3319                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3320 
3321   // Update dominator for loop exit.
3322   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3323 
3324   // Create and register the new vector loop.
3325   Loop *Lp = LI->AllocateLoop();
3326   Loop *ParentLoop = OrigLoop->getParentLoop();
3327 
3328   // Insert the new loop into the loop nest and register the new basic blocks
3329   // before calling any utilities such as SCEV that require valid LoopInfo.
3330   if (ParentLoop) {
3331     ParentLoop->addChildLoop(Lp);
3332   } else {
3333     LI->addTopLevelLoop(Lp);
3334   }
3335   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3336   return Lp;
3337 }
3338 
3339 void InnerLoopVectorizer::createInductionResumeValues(
3340     Loop *L, Value *VectorTripCount,
3341     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3342   assert(VectorTripCount && L && "Expected valid arguments");
3343   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3344           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3345          "Inconsistent information about additional bypass.");
3346   // We are going to resume the execution of the scalar loop.
3347   // Go over all of the induction variables that we found and fix the
3348   // PHIs that are left in the scalar version of the loop.
3349   // The starting values of PHI nodes depend on the counter of the last
3350   // iteration in the vectorized loop.
3351   // If we come from a bypass edge then we need to start from the original
3352   // start value.
3353   for (auto &InductionEntry : Legal->getInductionVars()) {
3354     PHINode *OrigPhi = InductionEntry.first;
3355     InductionDescriptor II = InductionEntry.second;
3356 
    // Create phi nodes to merge from the backedge-taken check block.
3358     PHINode *BCResumeVal =
3359         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3360                         LoopScalarPreHeader->getTerminator());
3361     // Copy original phi DL over to the new one.
3362     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3363     Value *&EndValue = IVEndValues[OrigPhi];
3364     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3365     if (OrigPhi == OldInduction) {
3366       // We know what the end value is.
3367       EndValue = VectorTripCount;
3368     } else {
3369       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3370       Type *StepType = II.getStep()->getType();
3371       Instruction::CastOps CastOp =
3372           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3373       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3374       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3375       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3376       EndValue->setName("ind.end");
3377 
3378       // Compute the end value for the additional bypass (if applicable).
3379       if (AdditionalBypass.first) {
3380         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3381         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3382                                          StepType, true);
3383         CRD =
3384             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3385         EndValueFromAdditionalBypass =
3386             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3387         EndValueFromAdditionalBypass->setName("ind.end");
3388       }
3389     }
3390     // The new PHI merges the original incoming value, in case of a bypass,
3391     // or the value at the end of the vectorized loop.
3392     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3393 
3394     // Fix the scalar body counter (PHI node).
3395     // The old induction's phi node in the scalar body needs the truncated
3396     // value.
3397     for (BasicBlock *BB : LoopBypassBlocks)
3398       BCResumeVal->addIncoming(II.getStartValue(), BB);
3399 
3400     if (AdditionalBypass.first)
3401       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3402                                             EndValueFromAdditionalBypass);
3403 
3404     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3405   }
3406 }
3407 
3408 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3409                                                       MDNode *OrigLoopID) {
3410   assert(L && "Expected valid loop.");
3411 
3412   // The trip counts should be cached by now.
3413   Value *Count = getOrCreateTripCount(L);
3414   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3415 
3416   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3417 
3418   // Add a check in the middle block to see if we have completed
3419   // all of the iterations in the first vector loop.
3420   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3421   // If tail is to be folded, we know we don't need to run the remainder.
3422   Value *CmpN = Builder.getTrue();
3423   if (!Cost->foldTailByMasking()) {
3424     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3425                            VectorTripCount, "cmp.n",
3426                            LoopMiddleBlock->getTerminator());
3427 
3428     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3429     // of the corresponding compare because they may have ended up with
3430     // different line numbers and we want to avoid awkward line stepping while
3431     // debugging. Eg. if the compare has got a line number inside the loop.
3432     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3433   }
3434 
3435   BranchInst *BrInst =
3436       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3437   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3438   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3439 
3440   // Get ready to start creating new instructions into the vectorized body.
3441   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3442          "Inconsistent vector loop preheader");
3443   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3444 
3445   Optional<MDNode *> VectorizedLoopID =
3446       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3447                                       LLVMLoopVectorizeFollowupVectorized});
3448   if (VectorizedLoopID.hasValue()) {
3449     L->setLoopID(VectorizedLoopID.getValue());
3450 
3451     // Do not setAlreadyVectorized if loop attributes have been defined
3452     // explicitly.
3453     return LoopVectorPreHeader;
3454   }
3455 
3456   // Keep all loop hints from the original loop on the vector loop (we'll
3457   // replace the vectorizer-specific hints below).
3458   if (MDNode *LID = OrigLoop->getLoopID())
3459     L->setLoopID(LID);
3460 
3461   LoopVectorizeHints Hints(L, true, *ORE);
3462   Hints.setAlreadyVectorized();
3463 
3464 #ifdef EXPENSIVE_CHECKS
3465   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3466   LI->verify(*DT);
3467 #endif
3468 
3469   return LoopVectorPreHeader;
3470 }
3471 
3472 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3473   /*
3474    In this function we generate a new loop. The new loop will contain
3475    the vectorized instructions while the old loop will continue to run the
3476    scalar remainder.
3477 
3478        [ ] <-- loop iteration number check.
3479     /   |
3480    /    v
3481   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3482   |  /  |
3483   | /   v
3484   ||   [ ]     <-- vector pre header.
3485   |/    |
3486   |     v
3487   |    [  ] \
3488   |    [  ]_|   <-- vector loop.
3489   |     |
3490   |     v
3491   |   -[ ]   <--- middle-block.
3492   |  /  |
3493   | /   v
3494   -|- >[ ]     <--- new preheader.
3495    |    |
3496    |    v
3497    |   [ ] \
3498    |   [ ]_|   <-- old scalar loop to handle remainder.
3499     \   |
3500      \  v
3501       >[ ]     <-- exit block.
3502    ...
3503    */
3504 
3505   // Get the metadata of the original loop before it gets modified.
3506   MDNode *OrigLoopID = OrigLoop->getLoopID();
3507 
3508   // Create an empty vector loop, and prepare basic blocks for the runtime
3509   // checks.
3510   Loop *Lp = createVectorLoopSkeleton("");
3511 
3512   // Now, compare the new count to zero. If it is zero skip the vector loop and
3513   // jump to the scalar loop. This check also covers the case where the
3514   // backedge-taken count is uint##_max: adding one to it will overflow leading
3515   // to an incorrect trip count of zero. In this (rare) case we will also jump
3516   // to the scalar loop.
3517   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3518 
3519   // Generate the code to check any assumptions that we've made for SCEV
3520   // expressions.
3521   emitSCEVChecks(Lp, LoopScalarPreHeader);
3522 
3523   // Generate the code that checks in runtime if arrays overlap. We put the
3524   // checks into a separate block to make the more common case of few elements
3525   // faster.
3526   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3527 
3528   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators that often have multiple pointer
3530   // induction variables. In the code below we also support a case where we
3531   // don't have a single induction variable.
3532   //
3533   // We try to obtain an induction variable from the original loop as hard
3534   // as possible. However if we don't find one that:
3535   //   - is an integer
3536   //   - counts from zero, stepping by one
3537   //   - is the size of the widest induction variable type
3538   // then we create a new one.
3539   OldInduction = Legal->getPrimaryInduction();
3540   Type *IdxTy = Legal->getWidestInductionType();
3541   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3542   // The loop step is equal to the vectorization factor (num of SIMD elements)
3543   // times the unroll factor (num of SIMD instructions).
3544   Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3545   Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3546   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3547   Induction =
3548       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3549                               getDebugLocFromInstOrOperands(OldInduction));
3550 
3551   // Emit phis for the new starting index of the scalar loop.
3552   createInductionResumeValues(Lp, CountRoundDown);
3553 
3554   return completeLoopSkeleton(Lp, OrigLoopID);
3555 }
3556 
3557 // Fix up external users of the induction variable. At this point, we are
3558 // in LCSSA form, with all external PHIs that use the IV having one input value,
3559 // coming from the remainder loop. We need those PHIs to also have a correct
3560 // value for the IV when arriving directly from the middle block.
3561 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3562                                        const InductionDescriptor &II,
3563                                        Value *CountRoundDown, Value *EndValue,
3564                                        BasicBlock *MiddleBlock) {
3565   // There are two kinds of external IV usages - those that use the value
3566   // computed in the last iteration (the post-increment value that feeds the
3567   // phi from the loop latch) and those that use the penultimate value (the
3568   // phi itself). We allow both, but they, obviously, have different values.
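  // As an illustrative shorthand (hypothetical IR, not from any specific test):
  //
  //   loop:
  //     %iv = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
  //     %iv.next = add i64 %iv, 1
  //     ...
  //   exit:
  //     %a = phi i64 [ %iv.next, %loop ]  ; last value, gets EndValue
  //     %b = phi i64 [ %iv, %loop ]       ; penultimate value, gets
  //                                       ; Start + Step * (CRD - 1)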
3569 
3570   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3571 
3572   DenseMap<Value *, Value *> MissingVals;
3573 
3574   // An external user of the last iteration's value should see the value that
3575   // the remainder loop uses to initialize its own IV.
3576   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3577   for (User *U : PostInc->users()) {
3578     Instruction *UI = cast<Instruction>(U);
3579     if (!OrigLoop->contains(UI)) {
3580       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3581       MissingVals[UI] = EndValue;
3582     }
3583   }
3584 
3585   // An external user of the penultimate value needs to see EndValue - Step.
3586   // The simplest way to get this is to recompute it from the constituent SCEVs,
3587   // that is Start + (Step * (CRD - 1)).
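  // For example, with Start = 0, Step = 2 and CRD = 8 the penultimate value is
  // 0 + 2 * (8 - 1) = 14, one step short of the end value 16.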
3588   for (User *U : OrigPhi->users()) {
3589     auto *UI = cast<Instruction>(U);
3590     if (!OrigLoop->contains(UI)) {
3591       const DataLayout &DL =
3592           OrigLoop->getHeader()->getModule()->getDataLayout();
3593       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3594 
3595       IRBuilder<> B(MiddleBlock->getTerminator());
3596       Value *CountMinusOne = B.CreateSub(
3597           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3598       Value *CMO =
3599           !II.getStep()->getType()->isIntegerTy()
3600               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3601                              II.getStep()->getType())
3602               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3603       CMO->setName("cast.cmo");
3604       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3605       Escape->setName("ind.escape");
3606       MissingVals[UI] = Escape;
3607     }
3608   }
3609 
3610   for (auto &I : MissingVals) {
3611     PHINode *PHI = cast<PHINode>(I.first);
3612     // One corner case we have to handle is two IVs "chasing" each other,
3613     // that is %IV2 = phi [...], [ %IV1, %latch ]
3614     // In this case, if IV1 has an external use, we need to avoid adding both
3615     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3616     // don't already have an incoming value for the middle block.
3617     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3618       PHI->addIncoming(I.second, MiddleBlock);
3619   }
3620 }
3621 
3622 namespace {
3623 
3624 struct CSEDenseMapInfo {
3625   static bool canHandle(const Instruction *I) {
3626     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3627            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3628   }
3629 
3630   static inline Instruction *getEmptyKey() {
3631     return DenseMapInfo<Instruction *>::getEmptyKey();
3632   }
3633 
3634   static inline Instruction *getTombstoneKey() {
3635     return DenseMapInfo<Instruction *>::getTombstoneKey();
3636   }
3637 
3638   static unsigned getHashValue(const Instruction *I) {
3639     assert(canHandle(I) && "Unknown instruction!");
3640     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3641                                                            I->value_op_end()));
3642   }
3643 
3644   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3645     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3646         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3647       return LHS == RHS;
3648     return LHS->isIdenticalTo(RHS);
3649   }
3650 };
3651 
3652 } // end anonymous namespace
3653 
3654 /// Perform CSE of induction variable instructions.
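/// As an illustrative example (not tied to any particular input), two
/// identical extractelement instructions in \p BB are collapsed into one:
///   %e0 = extractelement <4 x i32> %v, i32 0
///   %e1 = extractelement <4 x i32> %v, i32 0  ; replaced by %e0 and erased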
3655 static void cse(BasicBlock *BB) {
3656   // Perform simple cse.
3657   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3658   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3659     Instruction *In = &*I++;
3660 
3661     if (!CSEDenseMapInfo::canHandle(In))
3662       continue;
3663 
3664     // Check if we can replace this instruction with any of the
3665     // visited instructions.
3666     if (Instruction *V = CSEMap.lookup(In)) {
3667       In->replaceAllUsesWith(V);
3668       In->eraseFromParent();
3669       continue;
3670     }
3671 
3672     CSEMap[In] = In;
3673   }
3674 }
3675 
3676 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3677                                                        ElementCount VF,
3678                                                        bool &NeedToScalarize) {
3679   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3680   Function *F = CI->getCalledFunction();
3681   Type *ScalarRetTy = CI->getType();
3682   SmallVector<Type *, 4> Tys, ScalarTys;
3683   for (auto &ArgOp : CI->arg_operands())
3684     ScalarTys.push_back(ArgOp->getType());
3685 
3686   // Estimate cost of scalarized vector call. The source operands are assumed
3687   // to be vectors, so we need to extract individual elements from there,
3688   // execute VF scalar calls, and then gather the result into the vector return
3689   // value.
3690   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3691                                                  TTI::TCK_RecipThroughput);
3692   if (VF.isScalar())
3693     return ScalarCallCost;
3694 
3695   // Compute corresponding vector type for return value and arguments.
3696   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3697   for (Type *ScalarTy : ScalarTys)
3698     Tys.push_back(ToVectorTy(ScalarTy, VF));
3699 
3700   // Compute costs of unpacking argument values for the scalar calls and
3701   // packing the return values to a vector.
3702   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3703 
3704   unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
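  // As a hypothetical illustration: with VF = 4, a scalar call cost of 10 and
  // a scalarization overhead of 6, the scalarized estimate is 10 * 4 + 6 = 46;
  // a vector library call costing less than that is preferred below.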
3705 
3706   // If we can't emit a vector call for this function, then the currently found
3707   // cost is the cost we need to return.
3708   NeedToScalarize = true;
3709   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3710   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3711 
3712   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3713     return Cost;
3714 
3715   // If the corresponding vector cost is cheaper, return its cost.
3716   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3717                                                  TTI::TCK_RecipThroughput);
3718   if (VectorCallCost < Cost) {
3719     NeedToScalarize = false;
3720     return VectorCallCost;
3721   }
3722   return Cost;
3723 }
3724 
3725 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3726                                                             ElementCount VF) {
3727   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3728   assert(ID && "Expected intrinsic call!");
3729 
3730   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3731   return TTI.getIntrinsicInstrCost(CostAttrs,
3732                                    TargetTransformInfo::TCK_RecipThroughput);
3733 }
3734 
3735 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3736   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3737   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3738   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3739 }
3740 
3741 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3742   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3743   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3744   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3745 }
3746 
3747 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3748   // For every instruction `I` in MinBWs, truncate the operands, create a
3749   // truncated version of `I` and reextend its result. InstCombine runs
3750   // later and will remove any ext/trunc pairs.
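  // As an illustrative shorthand (assuming an add whose result only needs 8
  // bits), for VF = 4:
  //   %a = add <4 x i32> %x, %y
  // is rewritten as
  //   %xt = trunc <4 x i32> %x to <4 x i8>
  //   %yt = trunc <4 x i32> %y to <4 x i8>
  //   %at = add <4 x i8> %xt, %yt
  //   %a  = zext <4 x i8> %at to <4 x i32>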
3751   SmallPtrSet<Value *, 4> Erased;
3752   for (const auto &KV : Cost->getMinimalBitwidths()) {
3753     // If the value wasn't vectorized, we must maintain the original scalar
3754     // type. The absence of the value from VectorLoopValueMap indicates that it
3755     // wasn't vectorized.
3756     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3757       continue;
3758     for (unsigned Part = 0; Part < UF; ++Part) {
3759       Value *I = getOrCreateVectorValue(KV.first, Part);
3760       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3761         continue;
3762       Type *OriginalTy = I->getType();
3763       Type *ScalarTruncatedTy =
3764           IntegerType::get(OriginalTy->getContext(), KV.second);
3765       auto *TruncatedTy = FixedVectorType::get(
3766           ScalarTruncatedTy,
3767           cast<FixedVectorType>(OriginalTy)->getNumElements());
3768       if (TruncatedTy == OriginalTy)
3769         continue;
3770 
3771       IRBuilder<> B(cast<Instruction>(I));
3772       auto ShrinkOperand = [&](Value *V) -> Value * {
3773         if (auto *ZI = dyn_cast<ZExtInst>(V))
3774           if (ZI->getSrcTy() == TruncatedTy)
3775             return ZI->getOperand(0);
3776         return B.CreateZExtOrTrunc(V, TruncatedTy);
3777       };
3778 
3779       // The actual instruction modification depends on the instruction type,
3780       // unfortunately.
3781       Value *NewI = nullptr;
3782       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3783         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3784                              ShrinkOperand(BO->getOperand(1)));
3785 
3786         // Any wrapping introduced by shrinking this operation shouldn't be
3787         // considered undefined behavior. So, we can't unconditionally copy
3788         // arithmetic wrapping flags to NewI.
3789         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3790       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3791         NewI =
3792             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3793                          ShrinkOperand(CI->getOperand(1)));
3794       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3795         NewI = B.CreateSelect(SI->getCondition(),
3796                               ShrinkOperand(SI->getTrueValue()),
3797                               ShrinkOperand(SI->getFalseValue()));
3798       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3799         switch (CI->getOpcode()) {
3800         default:
3801           llvm_unreachable("Unhandled cast!");
3802         case Instruction::Trunc:
3803           NewI = ShrinkOperand(CI->getOperand(0));
3804           break;
3805         case Instruction::SExt:
3806           NewI = B.CreateSExtOrTrunc(
3807               CI->getOperand(0),
3808               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3809           break;
3810         case Instruction::ZExt:
3811           NewI = B.CreateZExtOrTrunc(
3812               CI->getOperand(0),
3813               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3814           break;
3815         }
3816       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3817         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3818                              ->getNumElements();
3819         auto *O0 = B.CreateZExtOrTrunc(
3820             SI->getOperand(0),
3821             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3822         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3823                              ->getNumElements();
3824         auto *O1 = B.CreateZExtOrTrunc(
3825             SI->getOperand(1),
3826             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3827 
3828         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3829       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3830         // Don't do anything with the operands, just extend the result.
3831         continue;
3832       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3833         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3834                             ->getNumElements();
3835         auto *O0 = B.CreateZExtOrTrunc(
3836             IE->getOperand(0),
3837             FixedVectorType::get(ScalarTruncatedTy, Elements));
3838         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3839         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3840       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3841         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3842                             ->getNumElements();
3843         auto *O0 = B.CreateZExtOrTrunc(
3844             EE->getOperand(0),
3845             FixedVectorType::get(ScalarTruncatedTy, Elements));
3846         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3847       } else {
3848         // If we don't know what to do, be conservative and don't do anything.
3849         continue;
3850       }
3851 
3852       // Lastly, extend the result.
3853       NewI->takeName(cast<Instruction>(I));
3854       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3855       I->replaceAllUsesWith(Res);
3856       cast<Instruction>(I)->eraseFromParent();
3857       Erased.insert(I);
3858       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3859     }
3860   }
3861 
3862   // We'll have created a bunch of ZExts that are now unused. Clean them up.
3863   for (const auto &KV : Cost->getMinimalBitwidths()) {
3864     // If the value wasn't vectorized, we must maintain the original scalar
3865     // type. The absence of the value from VectorLoopValueMap indicates that it
3866     // wasn't vectorized.
3867     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3868       continue;
3869     for (unsigned Part = 0; Part < UF; ++Part) {
3870       Value *I = getOrCreateVectorValue(KV.first, Part);
3871       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3872       if (Inst && Inst->use_empty()) {
3873         Value *NewI = Inst->getOperand(0);
3874         Inst->eraseFromParent();
3875         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3876       }
3877     }
3878   }
3879 }
3880 
3881 void InnerLoopVectorizer::fixVectorizedLoop() {
3882   // Insert truncates and extends for any truncated instructions as hints to
3883   // InstCombine.
3884   if (VF.isVector())
3885     truncateToMinimalBitwidths();
3886 
3887   // Fix widened non-induction PHIs by setting up the PHI operands.
3888   if (OrigPHIsToFix.size()) {
3889     assert(EnableVPlanNativePath &&
3890            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3891     fixNonInductionPHIs();
3892   }
3893 
3894   // At this point every instruction in the original loop is widened to a
3895   // vector form. Now we need to fix the recurrences in the loop. These PHI
3896   // nodes are currently empty because we did not want to introduce cycles.
3897   // This is the second stage of vectorizing recurrences.
3898   fixCrossIterationPHIs();
3899 
3900   // Forget the original basic block.
3901   PSE.getSE()->forgetLoop(OrigLoop);
3902 
3903   // Fix-up external users of the induction variables.
3904   for (auto &Entry : Legal->getInductionVars())
3905     fixupIVUsers(Entry.first, Entry.second,
3906                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3907                  IVEndValues[Entry.first], LoopMiddleBlock);
3908 
3909   fixLCSSAPHIs();
3910   for (Instruction *PI : PredicatedInstructions)
3911     sinkScalarOperands(&*PI);
3912 
3913   // Remove redundant induction instructions.
3914   cse(LoopVectorBody);
3915 
3916   // Set/update profile weights for the vector and remainder loops as original
3917   // loop iterations are now distributed among them. Note that original loop
3918   // represented by LoopScalarBody becomes remainder loop after vectorization.
3919   //
3920   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3921   // end up with a slightly less precise result, but that should be OK since
3922   // the profile is not inherently precise anyway. Note also that a possible
3923   // bypass of the vector code caused by legality checks is ignored, assigning
3924   // all the weight to the vector loop, optimistically.
3925   //
3926   // For scalable vectorization we can't know at compile time how many
3927   // iterations of the loop are handled in one vector iteration, so instead we
3928   // assume a pessimistic vscale of '1'.
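  // As a rough illustration (hypothetical numbers): with an original trip-count
  // estimate of 100 and VF * UF = 8, the vector loop accounts for roughly
  // 100 / 8 = 12 of the original iterations, leaving about 4 for the remainder.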
3929   setProfileInfoAfterUnrolling(
3930       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3931       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3932 }
3933 
3934 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3935   // In order to support recurrences we need to be able to vectorize Phi nodes.
3936   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3937   // stage #2: We now need to fix the recurrences by adding incoming edges to
3938   // the currently empty PHI nodes. At this point every instruction in the
3939   // original loop is widened to a vector form so we can use them to construct
3940   // the incoming edges.
3941   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3942     // Handle first-order recurrences and reductions that need to be fixed.
3943     if (Legal->isFirstOrderRecurrence(&Phi))
3944       fixFirstOrderRecurrence(&Phi);
3945     else if (Legal->isReductionVariable(&Phi))
3946       fixReduction(&Phi);
3947   }
3948 }
3949 
3950 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3951   // This is the second phase of vectorizing first-order recurrences. An
3952   // overview of the transformation is described below. Suppose we have the
3953   // following loop.
3954   //
3955   //   for (int i = 0; i < n; ++i)
3956   //     b[i] = a[i] - a[i - 1];
3957   //
3958   // There is a first-order recurrence on "a". For this loop, the shorthand
3959   // scalar IR looks like:
3960   //
3961   //   scalar.ph:
3962   //     s_init = a[-1]
3963   //     br scalar.body
3964   //
3965   //   scalar.body:
3966   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3967   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3968   //     s2 = a[i]
3969   //     b[i] = s2 - s1
3970   //     br cond, scalar.body, ...
3971   //
3972   // In this example, s1 is a recurrence because its value depends on the
3973   // previous iteration. In the first phase of vectorization, we created a
3974   // temporary value for s1. We now complete the vectorization and produce the
3975   // shorthand vector IR shown below (for VF = 4, UF = 1).
3976   //
3977   //   vector.ph:
3978   //     v_init = vector(..., ..., ..., a[-1])
3979   //     br vector.body
3980   //
3981   //   vector.body
3982   //     i = phi [0, vector.ph], [i+4, vector.body]
3983   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3984   //     v2 = a[i, i+1, i+2, i+3];
3985   //     v3 = vector(v1(3), v2(0, 1, 2))
3986   //     b[i, i+1, i+2, i+3] = v2 - v3
3987   //     br cond, vector.body, middle.block
3988   //
3989   //   middle.block:
3990   //     x = v2(3)
3991   //     br scalar.ph
3992   //
3993   //   scalar.ph:
3994   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3995   //     br scalar.body
3996   //
3997   // After the vector loop completes execution, we extract the next value of
3998   // the recurrence (x) to use as the initial value in the scalar loop.
3999 
4000   // Get the original loop preheader and single loop latch.
4001   auto *Preheader = OrigLoop->getLoopPreheader();
4002   auto *Latch = OrigLoop->getLoopLatch();
4003 
4004   // Get the initial and previous values of the scalar recurrence.
4005   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4006   auto *Previous = Phi->getIncomingValueForBlock(Latch);
4007 
4008   // Create a vector from the initial value.
4009   auto *VectorInit = ScalarInit;
4010   if (VF.isVector()) {
4011     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4012     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4013     VectorInit = Builder.CreateInsertElement(
4014         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4015         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
4016   }
4017 
4018   // We constructed a temporary phi node in the first phase of vectorization.
4019   // This phi node will eventually be deleted.
4020   Builder.SetInsertPoint(
4021       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
4022 
4023   // Create a phi node for the new recurrence. The current value will either be
4024   // the initial value inserted into a vector or loop-varying vector value.
4025   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4026   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4027 
4028   // Get the vectorized previous value of the last part UF - 1. It appears last
4029   // among all unrolled iterations, due to the order of their construction.
4030   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
4031 
4032   // Find and set the insertion point after the previous value if it is an
4033   // instruction.
4034   BasicBlock::iterator InsertPt;
4035   // Note that the previous value may have been constant-folded so it is not
4036   // guaranteed to be an instruction in the vector loop.
4037   // FIXME: Loop invariant values do not form recurrences. We should deal with
4038   //        them earlier.
4039   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4040     InsertPt = LoopVectorBody->getFirstInsertionPt();
4041   else {
4042     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4043     if (isa<PHINode>(PreviousLastPart))
4044       // If the previous value is a phi node, we should insert after all the phi
4045       // nodes in the block containing the PHI to avoid breaking basic block
4046       // verification. Note that the basic block may be different to
4047       // LoopVectorBody, in case we predicate the loop.
4048       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4049     else
4050       InsertPt = ++PreviousInst->getIterator();
4051   }
4052   Builder.SetInsertPoint(&*InsertPt);
4053 
4054   // We will construct a vector for the recurrence by combining the values for
4055   // the current and previous iterations. This is the required shuffle mask.
4056   assert(!VF.isScalable());
4057   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
4058   ShuffleMask[0] = VF.getKnownMinValue() - 1;
4059   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
4060     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
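  // For example, for VF = 4 the mask is <3, 4, 5, 6>: the last lane of the
  // first input vector (the previous iteration's value) followed by the first
  // three lanes of the second input (the current iteration's value).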
4061 
4062   // The vector from which to take the initial value for the current iteration
4063   // (actual or unrolled). Initially, this is the vector phi node.
4064   Value *Incoming = VecPhi;
4065 
4066   // Shuffle the current and previous vector and update the vector parts.
4067   for (unsigned Part = 0; Part < UF; ++Part) {
4068     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
4069     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
4070     auto *Shuffle =
4071         VF.isVector()
4072             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
4073             : Incoming;
4074     PhiPart->replaceAllUsesWith(Shuffle);
4075     cast<Instruction>(PhiPart)->eraseFromParent();
4076     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
4077     Incoming = PreviousPart;
4078   }
4079 
4080   // Fix the latch value of the new recurrence in the vector loop.
4081   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4082 
4083   // Extract the last vector element in the middle block. This will be the
4084   // initial value for the recurrence when jumping to the scalar loop.
4085   auto *ExtractForScalar = Incoming;
4086   if (VF.isVector()) {
4087     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4088     ExtractForScalar = Builder.CreateExtractElement(
4089         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
4090         "vector.recur.extract");
4091   }
4092   // Extract the second last element in the middle block if the
4093   // Phi is used outside the loop. We need to extract the phi itself
4094   // and not the last element (the phi update in the current iteration). This
4095   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4096   // when the scalar loop is not run at all.
4097   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4098   if (VF.isVector())
4099     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4100         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4101         "vector.recur.extract.for.phi");
4102   // When the loop is unrolled without vectorizing, initialize
4103   // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
4104   // value of `Incoming`. This is analogous to the vectorized case above:
4105   // extracting the second last element when VF > 1.
4106   else if (UF > 1)
4107     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
4108 
4109   // Fix the initial value of the original recurrence in the scalar loop.
4110   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4111   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4112   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4113     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4114     Start->addIncoming(Incoming, BB);
4115   }
4116 
4117   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4118   Phi->setName("scalar.recur");
4119 
4120   // Finally, fix users of the recurrence outside the loop. The users will need
4121   // either the last value of the scalar recurrence or the last value of the
4122   // vector recurrence we extracted in the middle block. Since the loop is in
4123   // LCSSA form, we just need to find all the phi nodes for the original scalar
4124   // recurrence in the exit block, and then add an edge for the middle block.
4125   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4126     if (LCSSAPhi.getIncomingValue(0) == Phi) {
4127       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4128     }
4129   }
4130 }
4131 
4132 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
4133   Constant *Zero = Builder.getInt32(0);
4134 
4135   // Get its reduction variable descriptor.
4136   assert(Legal->isReductionVariable(Phi) &&
4137          "Unable to find the reduction variable");
4138   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4139 
4140   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4141   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4142   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4143   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
4144     RdxDesc.getMinMaxRecurrenceKind();
4145   setDebugLocFromInst(Builder, ReductionStartValue);
4146   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4147 
4148   // We need to generate a reduction vector from the incoming scalar.
4149   // To do so, we need to generate the 'identity' vector and override
4150   // one of the elements with the incoming scalar reduction. We need
4151   // to do it in the vector-loop preheader.
4152   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4153 
4154   // This is the vector-clone of the value that leaves the loop.
4155   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
4156 
4157   // Find the reduction identity value: zero for addition, or and xor; one
4158   // for multiplication; -1 for and.
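  // For example (illustrative), for an integer add reduction with start value
  // %s and VF = 4, the identity vector is <0, 0, 0, 0> and the starting vector
  // built below becomes <%s, 0, 0, 0>.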
4159   Value *Identity;
4160   Value *VectorStart;
4161   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
4162       RK == RecurrenceDescriptor::RK_FloatMinMax) {
4163     // MinMax reductions have the start value as their identity.
4164     if (VF.isScalar() || IsInLoopReductionPhi) {
4165       VectorStart = Identity = ReductionStartValue;
4166     } else {
4167       VectorStart = Identity =
4168         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
4169     }
4170   } else {
4171     // Handle other reduction kinds:
4172     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
4173         RK, MinMaxKind, VecTy->getScalarType());
4174     if (VF.isScalar() || IsInLoopReductionPhi) {
4175       Identity = Iden;
4176       // This vector is the Identity vector where the first element is the
4177       // incoming scalar reduction.
4178       VectorStart = ReductionStartValue;
4179     } else {
4180       Identity = ConstantVector::getSplat(VF, Iden);
4181 
4182       // This vector is the Identity vector where the first element is the
4183       // incoming scalar reduction.
4184       VectorStart =
4185         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
4186     }
4187   }
4188 
4189   // Wrap flags are in general invalid after vectorization, clear them.
4190   clearReductionWrapFlags(RdxDesc);
4191 
4192   // Fix the vector-loop phi.
4193 
4194   // Reductions do not have to start at zero. They can start with
4195   // any loop invariant values.
4196   BasicBlock *Latch = OrigLoop->getLoopLatch();
4197   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4198 
4199   for (unsigned Part = 0; Part < UF; ++Part) {
4200     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
4201     Value *Val = getOrCreateVectorValue(LoopVal, Part);
4202     // Make sure to add the reduction start value only to the
4203     // first unroll part.
4204     Value *StartVal = (Part == 0) ? VectorStart : Identity;
4205     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
4206     cast<PHINode>(VecRdxPhi)
4207       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4208   }
4209 
4210   // Before each round, move the insertion point right between
4211   // the PHIs and the values we are going to write.
4212   // This allows us to write both PHINodes and the extractelement
4213   // instructions.
4214   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4215 
4216   setDebugLocFromInst(Builder, LoopExitInst);
4217 
4218   // If tail is folded by masking, the vector value to leave the loop should be
4219   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4220   // instead of the former. For an inloop reduction the reduction will already
4221   // be predicated, and does not need to be handled here.
4222   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4223     for (unsigned Part = 0; Part < UF; ++Part) {
4224       Value *VecLoopExitInst =
4225           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4226       Value *Sel = nullptr;
4227       for (User *U : VecLoopExitInst->users()) {
4228         if (isa<SelectInst>(U)) {
4229           assert(!Sel && "Reduction exit feeding two selects");
4230           Sel = U;
4231         } else
4232           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4233       }
4234       assert(Sel && "Reduction exit feeds no select");
4235       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4236 
4237       // If the target can create a predicated operator for the reduction at no
4238       // extra cost in the loop (for example a predicated vadd), it can be
4239       // cheaper for the select to remain in the loop than be sunk out of it,
4240       // and so use the select value for the phi instead of the old
4241       // LoopExitValue.
4242       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4243       if (PreferPredicatedReductionSelect ||
4244           TTI->preferPredicatedReductionSelect(
4245               RdxDesc.getRecurrenceBinOp(), Phi->getType(),
4246               TargetTransformInfo::ReductionFlags())) {
4247         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4248         VecRdxPhi->setIncomingValueForBlock(
4249             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4250       }
4251     }
4252   }
4253 
4254   // If the vector reduction can be performed in a smaller type, we truncate
4255   // then extend the loop exit value to enable InstCombine to evaluate the
4256   // entire expression in the smaller type.
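  // For example (a sketch, assuming an add reduction known to need only 8 bits
  // with VF = 4): the <4 x i32> exit value is truncated to <4 x i8> and
  // extended back, letting InstCombine later narrow the whole reduction chain.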
4257   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4258     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4259     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4260     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4261     Builder.SetInsertPoint(
4262         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4263     VectorParts RdxParts(UF);
4264     for (unsigned Part = 0; Part < UF; ++Part) {
4265       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4266       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4267       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4268                                         : Builder.CreateZExt(Trunc, VecTy);
4269       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4270            UI != RdxParts[Part]->user_end();)
4271         if (*UI != Trunc) {
4272           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4273           RdxParts[Part] = Extnd;
4274         } else {
4275           ++UI;
4276         }
4277     }
4278     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4279     for (unsigned Part = 0; Part < UF; ++Part) {
4280       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4281       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4282     }
4283   }
4284 
4285   // Reduce all of the unrolled parts into a single vector.
4286   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4287   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4288 
4289   // The middle block terminator has already been assigned a DebugLoc here (the
4290   // OrigLoop's single latch terminator). We want the whole middle block to
4291   // appear to execute on this line because: (a) it is all compiler generated,
4292   // (b) these instructions are always executed after evaluating the latch
4293   // conditional branch, and (c) other passes may add new predecessors which
4294   // terminate on this line. This is the easiest way to ensure we don't
4295   // accidentally cause an extra step back into the loop while debugging.
4296   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4297   for (unsigned Part = 1; Part < UF; ++Part) {
4298     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4299     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4300       // Floating point operations had to be 'fast' to enable the reduction.
4301       ReducedPartRdx = addFastMathFlag(
4302           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4303                               ReducedPartRdx, "bin.rdx"),
4304           RdxDesc.getFastMathFlags());
4305     else
4306       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4307                                       RdxPart);
4308   }
4309 
4310   // Create the reduction after the loop. Note that inloop reductions create the
4311   // target reduction in the loop using a Reduction recipe.
4312   if (VF.isVector() && !IsInLoopReductionPhi) {
4313     bool NoNaN = Legal->hasFunNoNaNAttr();
4314     ReducedPartRdx =
4315         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4316     // If the reduction can be performed in a smaller type, we need to extend
4317     // the reduction to the wider type before we branch to the original loop.
4318     if (Phi->getType() != RdxDesc.getRecurrenceType())
4319       ReducedPartRdx =
4320         RdxDesc.isSigned()
4321         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4322         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4323   }
4324 
4325   // Create a phi node that merges control-flow from the backedge-taken check
4326   // block and the middle block.
4327   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4328                                         LoopScalarPreHeader->getTerminator());
4329   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4330     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4331   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4332 
4333   // Now, we need to fix the users of the reduction variable
4334   // inside and outside of the scalar remainder loop.
4335   // We know that the loop is in LCSSA form. We need to update the
4336   // PHI nodes in the exit blocks.
4337   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4338     // All PHINodes need to have a single entry edge, or two if
4339     // we already fixed them.
4340     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4341 
4342     // We found a reduction value exit-PHI. Update it with the
4343     // incoming bypass edge.
4344     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4345       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4346   } // end of the LCSSA phi scan.
4347 
4348   // Fix the scalar loop reduction variable with the incoming reduction sum
4349   // from the vector body and from the backedge value.
4350   int IncomingEdgeBlockIdx =
4351     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4352   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4353   // Pick the other block.
4354   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4355   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4356   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4357 }
4358 
4359 void InnerLoopVectorizer::clearReductionWrapFlags(
4360     RecurrenceDescriptor &RdxDesc) {
4361   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4362   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4363       RK != RecurrenceDescriptor::RK_IntegerMult)
4364     return;
4365 
4366   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4367   assert(LoopExitInstr && "null loop exit instruction");
4368   SmallVector<Instruction *, 8> Worklist;
4369   SmallPtrSet<Instruction *, 8> Visited;
4370   Worklist.push_back(LoopExitInstr);
4371   Visited.insert(LoopExitInstr);
4372 
4373   while (!Worklist.empty()) {
4374     Instruction *Cur = Worklist.pop_back_val();
4375     if (isa<OverflowingBinaryOperator>(Cur))
4376       for (unsigned Part = 0; Part < UF; ++Part) {
4377         Value *V = getOrCreateVectorValue(Cur, Part);
4378         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4379       }
4380 
4381     for (User *U : Cur->users()) {
4382       Instruction *UI = cast<Instruction>(U);
4383       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4384           Visited.insert(UI).second)
4385         Worklist.push_back(UI);
4386     }
4387   }
4388 }
4389 
4390 void InnerLoopVectorizer::fixLCSSAPHIs() {
4391   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4392     if (LCSSAPhi.getNumIncomingValues() == 1) {
4393       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4394       // Non-instruction incoming values will have only one value.
4395       unsigned LastLane = 0;
4396       if (isa<Instruction>(IncomingValue))
4397         LastLane = Cost->isUniformAfterVectorization(
4398                        cast<Instruction>(IncomingValue), VF)
4399                        ? 0
4400                        : VF.getKnownMinValue() - 1;
4401       assert((!VF.isScalable() || LastLane == 0) &&
4402              "scalable vectors dont support non-uniform scalars yet");
4403       // Can be a loop invariant incoming value or the last scalar value to be
4404       // extracted from the vectorized loop.
4405       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4406       Value *lastIncomingValue =
4407           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4408       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4409     }
4410   }
4411 }
4412 
4413 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4414   // The basic block and loop containing the predicated instruction.
4415   auto *PredBB = PredInst->getParent();
4416   auto *VectorLoop = LI->getLoopFor(PredBB);
4417 
4418   // Initialize a worklist with the operands of the predicated instruction.
4419   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4420 
4421   // Holds instructions that we need to analyze again. An instruction may be
4422   // reanalyzed if we don't yet know if we can sink it or not.
4423   SmallVector<Instruction *, 8> InstsToReanalyze;
4424 
4425   // Returns true if a given use occurs in the predicated block. Phi nodes use
4426   // their operands in their corresponding predecessor blocks.
4427   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4428     auto *I = cast<Instruction>(U.getUser());
4429     BasicBlock *BB = I->getParent();
4430     if (auto *Phi = dyn_cast<PHINode>(I))
4431       BB = Phi->getIncomingBlock(
4432           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4433     return BB == PredBB;
4434   };
4435 
4436   // Iteratively sink the scalarized operands of the predicated instruction
4437   // into the block we created for it. When an instruction is sunk, its
4438   // operands are then added to the worklist. The algorithm ends after one pass
4439   // through the worklist doesn't sink a single instruction.
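  // As an illustrative example (hypothetical IR): if the predicated block
  // contains
  //   %d = sdiv i32 %a, %b
  // and %a = add i32 %x, 1 is used only by %d, the add is sunk into the
  // predicated block; its own operands are then re-queued for analysis.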
4440   bool Changed;
4441   do {
4442     // Add the instructions that need to be reanalyzed to the worklist, and
4443     // reset the changed indicator.
4444     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4445     InstsToReanalyze.clear();
4446     Changed = false;
4447 
4448     while (!Worklist.empty()) {
4449       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4450 
4451       // We can't sink an instruction if it is a phi node, is already in the
4452       // predicated block, is not in the loop, or may have side effects.
4453       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4454           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4455         continue;
4456 
4457       // It's legal to sink the instruction if all its uses occur in the
4458       // predicated block. Otherwise, there's nothing to do yet, and we may
4459       // need to reanalyze the instruction.
4460       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4461         InstsToReanalyze.push_back(I);
4462         continue;
4463       }
4464 
4465       // Move the instruction to the beginning of the predicated block, and add
4466       // its operands to the worklist.
4467       I->moveBefore(&*PredBB->getFirstInsertionPt());
4468       Worklist.insert(I->op_begin(), I->op_end());
4469 
4470       // The sinking may have enabled other instructions to be sunk, so we will
4471       // need to iterate.
4472       Changed = true;
4473     }
4474   } while (Changed);
4475 }
4476 
4477 void InnerLoopVectorizer::fixNonInductionPHIs() {
4478   for (PHINode *OrigPhi : OrigPHIsToFix) {
4479     PHINode *NewPhi =
4480         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4481     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4482 
4483     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4484         predecessors(OrigPhi->getParent()));
4485     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4486         predecessors(NewPhi->getParent()));
4487     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4488            "Scalar and Vector BB should have the same number of predecessors");
4489 
4490     // The insertion point in Builder may be invalidated by the time we get
4491     // here. Force the Builder insertion point to something valid so that we do
4492     // not run into issues during insertion point restore in
4493     // getOrCreateVectorValue calls below.
4494     Builder.SetInsertPoint(NewPhi);
4495 
4496     // The predecessor order is preserved and we can rely on mapping between
4497     // scalar and vector block predecessors.
4498     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4499       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4500 
4501       // When looking up the new scalar/vector values to fix up, use incoming
4502       // values from original phi.
4503       Value *ScIncV =
4504           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4505 
4506       // Scalar incoming value may need a broadcast
4507       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4508       NewPhi->addIncoming(NewIncV, NewPredBB);
4509     }
4510   }
4511 }
4512 
4513 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4514                                    VPUser &Operands, unsigned UF,
4515                                    ElementCount VF, bool IsPtrLoopInvariant,
4516                                    SmallBitVector &IsIndexLoopInvariant,
4517                                    VPTransformState &State) {
4518   // Construct a vector GEP by widening the operands of the scalar GEP as
4519   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4520   // results in a vector of pointers when at least one operand of the GEP
4521   // is vector-typed. Thus, to keep the representation compact, we only use
4522   // vector-typed operands for loop-varying values.
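  // As a shorthand sketch (illustrative names): with a loop-invariant base %p
  // and a loop-varying index, for VF = 4 the scalar
  //   %g = getelementptr inbounds float, float* %p, i64 %i
  // is widened into a GEP producing a vector of pointers:
  //   %vg = getelementptr inbounds float, float* %p, <4 x i64> %vec.ind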
4523 
4524   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4525     // If we are vectorizing, but the GEP has only loop-invariant operands,
4526     // the GEP we build (by only using vector-typed operands for
4527     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4528     // produce a vector of pointers, we need to either arbitrarily pick an
4529     // operand to broadcast, or broadcast a clone of the original GEP.
4530     // Here, we broadcast a clone of the original.
4531     //
4532     // TODO: If at some point we decide to scalarize instructions having
4533     //       loop-invariant operands, this special case will no longer be
4534     //       required. We would add the scalarization decision to
4535     //       collectLoopScalars() and teach getVectorValue() to broadcast
4536     //       the lane-zero scalar value.
4537     auto *Clone = Builder.Insert(GEP->clone());
4538     for (unsigned Part = 0; Part < UF; ++Part) {
4539       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4540       State.set(VPDef, GEP, EntryPart, Part);
4541       addMetadata(EntryPart, GEP);
4542     }
4543   } else {
4544     // If the GEP has at least one loop-varying operand, we are sure to
4545     // produce a vector of pointers. But if we are only unrolling, we want
4546     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4547     // produce with the code below will be scalar (if VF == 1) or vector
4548     // (otherwise). Note that for the unroll-only case, we still maintain
4549     // values in the vector mapping with initVector, as we do for other
4550     // instructions.
4551     for (unsigned Part = 0; Part < UF; ++Part) {
4552       // The pointer operand of the new GEP. If it's loop-invariant, we
4553       // won't broadcast it.
4554       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4555                                      : State.get(Operands.getOperand(0), Part);
4556 
4557       // Collect all the indices for the new GEP. If any index is
4558       // loop-invariant, we won't broadcast it.
4559       SmallVector<Value *, 4> Indices;
4560       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4561         VPValue *Operand = Operands.getOperand(I);
4562         if (IsIndexLoopInvariant[I - 1])
4563           Indices.push_back(State.get(Operand, {0, 0}));
4564         else
4565           Indices.push_back(State.get(Operand, Part));
4566       }
4567 
4568       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4569       // but it should be a vector, otherwise.
4570       auto *NewGEP =
4571           GEP->isInBounds()
4572               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4573                                           Indices)
4574               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4575       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4576              "NewGEP is not a pointer vector");
4577       State.set(VPDef, GEP, NewGEP, Part);
4578       addMetadata(NewGEP, GEP);
4579     }
4580   }
4581 }
4582 
4583 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4584                                               ElementCount VF) {
4585   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4586   PHINode *P = cast<PHINode>(PN);
4587   if (EnableVPlanNativePath) {
4588     // Currently we enter here in the VPlan-native path for non-induction
4589     // PHIs where all control flow is uniform. We simply widen these PHIs.
4590     // Create a vector phi with no operands - the vector phi operands will be
4591     // set at the end of vector code generation.
4592     Type *VecTy =
4593         (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4594     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4595     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4596     OrigPHIsToFix.push_back(P);
4597 
4598     return;
4599   }
4600 
4601   assert(PN->getParent() == OrigLoop->getHeader() &&
4602          "Non-header phis should have been handled elsewhere");
4603 
4604   // In order to support recurrences we need to be able to vectorize Phi nodes.
4605   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4606   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4607   // this value when we vectorize all of the instructions that use the PHI.
4608   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4609     for (unsigned Part = 0; Part < UF; ++Part) {
4610       // This is phase one of vectorizing PHIs.
4611       bool ScalarPHI =
4612           (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4613       Type *VecTy =
4614           ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4615       Value *EntryPart = PHINode::Create(
4616           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4617       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4618     }
4619     return;
4620   }
4621 
4622   setDebugLocFromInst(Builder, P);
4623 
4624   // This PHINode must be an induction variable.
4625   // Make sure that we know about it.
4626   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4627 
4628   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4629   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4630 
4631   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4632   // which can be found from the original scalar operations.
4633   switch (II.getKind()) {
4634   case InductionDescriptor::IK_NoInduction:
4635     llvm_unreachable("Unknown induction");
4636   case InductionDescriptor::IK_IntInduction:
4637   case InductionDescriptor::IK_FpInduction:
4638     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4639   case InductionDescriptor::IK_PtrInduction: {
4640     // Handle the pointer induction variable case.
4641     assert(P->getType()->isPointerTy() && "Unexpected type.");
4642 
4643     if (Cost->isScalarAfterVectorization(P, VF)) {
4644       // This is the normalized GEP that starts counting at zero.
4645       Value *PtrInd =
4646           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4647       // Determine the number of scalars we need to generate for each unroll
4648       // iteration. If the instruction is uniform, we only need to generate the
4649       // first lane. Otherwise, we generate all VF values.
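      // For example (illustrative), with VF = 4 and UF = 2 a non-uniform
      // pointer induction produces eight scalar GEPs at offsets PtrInd + 0..7,
      // while a uniform one produces only one per part (offsets 0 and 4).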
4650       unsigned Lanes =
4651           Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4652       for (unsigned Part = 0; Part < UF; ++Part) {
4653         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4654           Constant *Idx = ConstantInt::get(PtrInd->getType(),
4655                                            Lane + Part * VF.getKnownMinValue());
4656           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4657           Value *SclrGep =
4658               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4659           SclrGep->setName("next.gep");
4660           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4661         }
4662       }
4663       return;
4664     }
4665     assert(isa<SCEVConstant>(II.getStep()) &&
4666            "Induction step not a SCEV constant!");
4667     Type *PhiType = II.getStep()->getType();
4668 
4669     // Build a pointer phi
4670     Value *ScalarStartValue = II.getStartValue();
4671     Type *ScStValueType = ScalarStartValue->getType();
4672     PHINode *NewPointerPhi =
4673         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4674     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4675 
4676     // A pointer induction, performed by using a gep
4677     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4678     Instruction *InductionLoc = LoopLatch->getTerminator();
4679     const SCEV *ScalarStep = II.getStep();
4680     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4681     Value *ScalarStepValue =
4682         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4683     Value *InductionGEP = GetElementPtrInst::Create(
4684         ScStValueType->getPointerElementType(), NewPointerPhi,
4685         Builder.CreateMul(
4686             ScalarStepValue,
4687             ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4688         "ptr.ind", InductionLoc);
4689     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4690 
4691     // Create UF many actual address geps that use the pointer
4692     // phi as base and a vectorized version of the step value
4693     // (<step*0, ..., step*N>) as offset.
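    // For example (illustrative), with VF = 4, UF = 2 and step %s, part 0 uses
    // offsets <0, %s, 2 * %s, 3 * %s> and part 1 uses <4 * %s, ..., 7 * %s>.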
4694     for (unsigned Part = 0; Part < UF; ++Part) {
4695       SmallVector<Constant *, 8> Indices;
4696       // Create a vector of consecutive numbers from zero to VF.
4697       for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4698         Indices.push_back(
4699             ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4700       Constant *StartOffset = ConstantVector::get(Indices);
4701 
4702       Value *GEP = Builder.CreateGEP(
4703           ScStValueType->getPointerElementType(), NewPointerPhi,
4704           Builder.CreateMul(
4705               StartOffset,
4706               Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4707               "vector.gep"));
4708       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4709     }
4710   }
4711   }
4712 }
4713 
4714 /// A helper function for checking whether an integer division-related
4715 /// instruction may divide by zero (in which case it must be predicated if
4716 /// executed conditionally in the scalar code).
4717 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4718 /// Non-zero divisors that are not compile-time constants will not be
4719 /// converted into multiplication, so we will still end up scalarizing
4720 /// the division, but can do so w/o predication.
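/// For example, 'udiv i32 %x, 7' has a known non-zero divisor and needs no
/// predication, while 'udiv i32 %x, %y' may divide by zero and must be
/// predicated if executed conditionally.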
4721 static bool mayDivideByZero(Instruction &I) {
4722   assert((I.getOpcode() == Instruction::UDiv ||
4723           I.getOpcode() == Instruction::SDiv ||
4724           I.getOpcode() == Instruction::URem ||
4725           I.getOpcode() == Instruction::SRem) &&
4726          "Unexpected instruction");
4727   Value *Divisor = I.getOperand(1);
4728   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4729   return !CInt || CInt->isZero();
4730 }
4731 
4732 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4733                                            VPUser &User,
4734                                            VPTransformState &State) {
4735   switch (I.getOpcode()) {
4736   case Instruction::Call:
4737   case Instruction::Br:
4738   case Instruction::PHI:
4739   case Instruction::GetElementPtr:
4740   case Instruction::Select:
4741     llvm_unreachable("This instruction is handled by a different recipe.");
4742   case Instruction::UDiv:
4743   case Instruction::SDiv:
4744   case Instruction::SRem:
4745   case Instruction::URem:
4746   case Instruction::Add:
4747   case Instruction::FAdd:
4748   case Instruction::Sub:
4749   case Instruction::FSub:
4750   case Instruction::FNeg:
4751   case Instruction::Mul:
4752   case Instruction::FMul:
4753   case Instruction::FDiv:
4754   case Instruction::FRem:
4755   case Instruction::Shl:
4756   case Instruction::LShr:
4757   case Instruction::AShr:
4758   case Instruction::And:
4759   case Instruction::Or:
4760   case Instruction::Xor: {
4761     // Just widen unops and binops.
4762     setDebugLocFromInst(Builder, &I);
4763 
4764     for (unsigned Part = 0; Part < UF; ++Part) {
4765       SmallVector<Value *, 2> Ops;
4766       for (VPValue *VPOp : User.operands())
4767         Ops.push_back(State.get(VPOp, Part));
4768 
4769       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4770 
4771       if (auto *VecOp = dyn_cast<Instruction>(V))
4772         VecOp->copyIRFlags(&I);
4773 
4774       // Use this vector value for all users of the original instruction.
4775       State.set(Def, &I, V, Part);
4776       addMetadata(V, &I);
4777     }
4778 
4779     break;
4780   }
4781   case Instruction::ICmp:
4782   case Instruction::FCmp: {
4783     // Widen compares. Generate vector compares.
4784     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4785     auto *Cmp = cast<CmpInst>(&I);
4786     setDebugLocFromInst(Builder, Cmp);
4787     for (unsigned Part = 0; Part < UF; ++Part) {
4788       Value *A = State.get(User.getOperand(0), Part);
4789       Value *B = State.get(User.getOperand(1), Part);
4790       Value *C = nullptr;
4791       if (FCmp) {
4792         // Propagate fast math flags.
4793         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4794         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4795         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4796       } else {
4797         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4798       }
4799       State.set(Def, &I, C, Part);
4800       addMetadata(C, &I);
4801     }
4802 
4803     break;
4804   }
4805 
4806   case Instruction::ZExt:
4807   case Instruction::SExt:
4808   case Instruction::FPToUI:
4809   case Instruction::FPToSI:
4810   case Instruction::FPExt:
4811   case Instruction::PtrToInt:
4812   case Instruction::IntToPtr:
4813   case Instruction::SIToFP:
4814   case Instruction::UIToFP:
4815   case Instruction::Trunc:
4816   case Instruction::FPTrunc:
4817   case Instruction::BitCast: {
4818     auto *CI = cast<CastInst>(&I);
4819     setDebugLocFromInst(Builder, CI);
4820 
    // Vectorize casts.
4822     Type *DestTy =
4823         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4824 
4825     for (unsigned Part = 0; Part < UF; ++Part) {
4826       Value *A = State.get(User.getOperand(0), Part);
4827       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4828       State.set(Def, &I, Cast, Part);
4829       addMetadata(Cast, &I);
4830     }
4831     break;
4832   }
4833   default:
4834     // This instruction is not vectorized by simple widening.
4835     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4836     llvm_unreachable("Unhandled instruction!");
4837   } // end of switch.
4838 }
4839 
4840 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4841                                                VPUser &ArgOperands,
4842                                                VPTransformState &State) {
4843   assert(!isa<DbgInfoIntrinsic>(I) &&
4844          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4845   setDebugLocFromInst(Builder, &I);
4846 
4847   Module *M = I.getParent()->getParent()->getParent();
4848   auto *CI = cast<CallInst>(&I);
4849 
4850   SmallVector<Type *, 4> Tys;
4851   for (Value *ArgOperand : CI->arg_operands())
4852     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4853 
4854   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4855 
  // This flag indicates whether to use an intrinsic or a regular call for the
  // vectorized version of the instruction, i.e. whether an intrinsic call is
  // more beneficial than a library call.
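  // For instance (illustrative only), a call to @llvm.sqrt.f32 may either be
  // widened to the @llvm.sqrt.v4f32 intrinsic or mapped to a vector library
  // routine known to TLI/VFDatabase, whichever the cost model finds cheaper.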
4859   bool NeedToScalarize = false;
4860   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4861   bool UseVectorIntrinsic =
4862       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4863   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4864          "Instruction should be scalarized elsewhere.");
4865 
4866   for (unsigned Part = 0; Part < UF; ++Part) {
4867     SmallVector<Value *, 4> Args;
4868     for (auto &I : enumerate(ArgOperands.operands())) {
4869       // Some intrinsics have a scalar argument - don't replace it with a
4870       // vector.
4871       Value *Arg;
4872       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4873         Arg = State.get(I.value(), Part);
4874       else
4875         Arg = State.get(I.value(), {0, 0});
4876       Args.push_back(Arg);
4877     }
4878 
4879     Function *VectorF;
4880     if (UseVectorIntrinsic) {
4881       // Use vector version of the intrinsic.
4882       Type *TysForDecl[] = {CI->getType()};
4883       if (VF.isVector()) {
4884         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4885         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4886       }
4887       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4888       assert(VectorF && "Can't retrieve vector intrinsic.");
4889     } else {
4890       // Use vector version of the function call.
4891       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4892 #ifndef NDEBUG
4893       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4894              "Can't create vector function.");
4895 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, &I, V, Part);
    addMetadata(V, &I);
4907   }
4908 }
4909 
4910 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
4911                                                  VPUser &Operands,
4912                                                  bool InvariantCond,
4913                                                  VPTransformState &State) {
4914   setDebugLocFromInst(Builder, &I);
4915 
  // The condition can be loop invariant but still defined inside the
4917   // loop. This means that we can't just use the original 'cond' value.
4918   // We have to take the 'vectorized' value and pick the first lane.
4919   // Instcombine will make this a no-op.
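  // For example, if the vectorized condition is a splat of a loop-invariant
  // compare, extracting lane 0 simply recovers the scalar compare, and later
  // simplification folds the extract away (illustrative sketch).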
4920   auto *InvarCond =
4921       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4922 
4923   for (unsigned Part = 0; Part < UF; ++Part) {
4924     Value *Cond =
4925         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4926     Value *Op0 = State.get(Operands.getOperand(1), Part);
4927     Value *Op1 = State.get(Operands.getOperand(2), Part);
4928     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4929     State.set(VPDef, &I, Sel, Part);
4930     addMetadata(Sel, &I);
4931   }
4932 }
4933 
4934 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4935   // We should not collect Scalars more than once per VF. Right now, this
4936   // function is called from collectUniformsAndScalars(), which already does
4937   // this check. Collecting Scalars for VF=1 does not make any sense.
4938   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4939          "This function should not be visited twice for the same VF");
4940 
4941   SmallSetVector<Instruction *, 8> Worklist;
4942 
4943   // These sets are used to seed the analysis with pointers used by memory
4944   // accesses that will remain scalar.
4945   SmallSetVector<Instruction *, 8> ScalarPtrs;
4946   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4947   auto *Latch = TheLoop->getLoopLatch();
4948 
4949   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4950   // The pointer operands of loads and stores will be scalar as long as the
4951   // memory access is not a gather or scatter operation. The value operand of a
4952   // store will remain scalar if the store is scalarized.
4953   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4954     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4955     assert(WideningDecision != CM_Unknown &&
4956            "Widening decision should be ready at this moment");
4957     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4958       if (Ptr == Store->getValueOperand())
4959         return WideningDecision == CM_Scalarize;
4960     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4961            "Ptr is neither a value or pointer operand");
4962     return WideningDecision != CM_GatherScatter;
4963   };
4964 
4965   // A helper that returns true if the given value is a bitcast or
4966   // getelementptr instruction contained in the loop.
4967   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4968     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4969             isa<GetElementPtrInst>(V)) &&
4970            !TheLoop->isLoopInvariant(V);
4971   };
4972 
4973   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4974     if (!isa<PHINode>(Ptr) ||
4975         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4976       return false;
4977     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4978     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4979       return false;
4980     return isScalarUse(MemAccess, Ptr);
4981   };
4982 
4983   // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is actually the pointer induction of a loop, it is inserted
  // into Worklist. If the use will be a scalar use, and the
4986   // pointer is only used by memory accesses, we place the pointer in
4987   // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
4988   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4989     if (isScalarPtrInduction(MemAccess, Ptr)) {
4990       Worklist.insert(cast<Instruction>(Ptr));
4991       Instruction *Update = cast<Instruction>(
4992           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4993       Worklist.insert(Update);
4994       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4995                         << "\n");
4996       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4997                         << "\n");
4998       return;
4999     }
5000     // We only care about bitcast and getelementptr instructions contained in
5001     // the loop.
5002     if (!isLoopVaryingBitCastOrGEP(Ptr))
5003       return;
5004 
5005     // If the pointer has already been identified as scalar (e.g., if it was
5006     // also identified as uniform), there's nothing to do.
5007     auto *I = cast<Instruction>(Ptr);
5008     if (Worklist.count(I))
5009       return;
5010 
5011     // If the use of the pointer will be a scalar use, and all users of the
5012     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5013     // place the pointer in PossibleNonScalarPtrs.
5014     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5015           return isa<LoadInst>(U) || isa<StoreInst>(U);
5016         }))
5017       ScalarPtrs.insert(I);
5018     else
5019       PossibleNonScalarPtrs.insert(I);
5020   };
5021 
  // We seed the scalars analysis with two classes of instructions: (1)
5023   // instructions marked uniform-after-vectorization and (2) bitcast,
5024   // getelementptr and (pointer) phi instructions used by memory accesses
5025   // requiring a scalar use.
5026   //
5027   // (1) Add to the worklist all instructions that have been identified as
5028   // uniform-after-vectorization.
5029   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5030 
5031   // (2) Add to the worklist all bitcast and getelementptr instructions used by
5032   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
5034   // scatter operation. The value operand of a store will remain scalar if the
5035   // store is scalarized.
5036   for (auto *BB : TheLoop->blocks())
5037     for (auto &I : *BB) {
5038       if (auto *Load = dyn_cast<LoadInst>(&I)) {
5039         evaluatePtrUse(Load, Load->getPointerOperand());
5040       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5041         evaluatePtrUse(Store, Store->getPointerOperand());
5042         evaluatePtrUse(Store, Store->getValueOperand());
5043       }
5044     }
5045   for (auto *I : ScalarPtrs)
5046     if (!PossibleNonScalarPtrs.count(I)) {
5047       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5048       Worklist.insert(I);
5049     }
5050 
5051   // Insert the forced scalars.
5052   // FIXME: Currently widenPHIInstruction() often creates a dead vector
5053   // induction variable when the PHI user is scalarized.
5054   auto ForcedScalar = ForcedScalars.find(VF);
5055   if (ForcedScalar != ForcedScalars.end())
5056     for (auto *I : ForcedScalar->second)
5057       Worklist.insert(I);
5058 
5059   // Expand the worklist by looking through any bitcasts and getelementptr
5060   // instructions we've already identified as scalar. This is similar to the
5061   // expansion step in collectLoopUniforms(); however, here we're only
5062   // expanding to include additional bitcasts and getelementptr instructions.
5063   unsigned Idx = 0;
5064   while (Idx != Worklist.size()) {
5065     Instruction *Dst = Worklist[Idx++];
5066     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5067       continue;
5068     auto *Src = cast<Instruction>(Dst->getOperand(0));
5069     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5070           auto *J = cast<Instruction>(U);
5071           return !TheLoop->contains(J) || Worklist.count(J) ||
5072                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5073                   isScalarUse(J, Src));
5074         })) {
5075       Worklist.insert(Src);
5076       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5077     }
5078   }
5079 
5080   // An induction variable will remain scalar if all users of the induction
5081   // variable and induction variable update remain scalar.
5082   for (auto &Induction : Legal->getInductionVars()) {
5083     auto *Ind = Induction.first;
5084     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5085 
5086     // If tail-folding is applied, the primary induction variable will be used
5087     // to feed a vector compare.
5088     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5089       continue;
5090 
5091     // Determine if all users of the induction variable are scalar after
5092     // vectorization.
5093     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5094       auto *I = cast<Instruction>(U);
5095       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5096     });
5097     if (!ScalarInd)
5098       continue;
5099 
5100     // Determine if all users of the induction variable update instruction are
5101     // scalar after vectorization.
5102     auto ScalarIndUpdate =
5103         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5104           auto *I = cast<Instruction>(U);
5105           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5106         });
5107     if (!ScalarIndUpdate)
5108       continue;
5109 
5110     // The induction variable and its update instruction will remain scalar.
5111     Worklist.insert(Ind);
5112     Worklist.insert(IndUpdate);
5113     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5114     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5115                       << "\n");
5116   }
5117 
5118   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5119 }
5120 
5121 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
5122                                                          ElementCount VF) {
5123   if (!blockNeedsPredication(I->getParent()))
5124     return false;
5125   switch(I->getOpcode()) {
5126   default:
5127     break;
5128   case Instruction::Load:
5129   case Instruction::Store: {
5130     if (!Legal->isMaskRequired(I))
5131       return false;
5132     auto *Ptr = getLoadStorePointerOperand(I);
5133     auto *Ty = getMemInstValueType(I);
5134     // We have already decided how to vectorize this instruction, get that
5135     // result.
5136     if (VF.isVector()) {
5137       InstWidening WideningDecision = getWideningDecision(I, VF);
5138       assert(WideningDecision != CM_Unknown &&
5139              "Widening decision should be ready at this moment");
5140       return WideningDecision == CM_Scalarize;
5141     }
5142     const Align Alignment = getLoadStoreAlignment(I);
5143     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5144                                 isLegalMaskedGather(Ty, Alignment))
5145                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5146                                 isLegalMaskedScatter(Ty, Alignment));
5147   }
5148   case Instruction::UDiv:
5149   case Instruction::SDiv:
5150   case Instruction::SRem:
5151   case Instruction::URem:
5152     return mayDivideByZero(*I);
5153   }
5154   return false;
5155 }
5156 
5157 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5158     Instruction *I, ElementCount VF) {
5159   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5160   assert(getWideningDecision(I, VF) == CM_Unknown &&
5161          "Decision should not be set yet.");
5162   auto *Group = getInterleavedAccessGroup(I);
5163   assert(Group && "Must have a group.");
5164 
  // If the instruction's allocated size doesn't equal its type size, it
5166   // requires padding and will be scalarized.
5167   auto &DL = I->getModule()->getDataLayout();
5168   auto *ScalarTy = getMemInstValueType(I);
5169   if (hasIrregularType(ScalarTy, DL, VF))
5170     return false;
5171 
5172   // Check if masking is required.
5173   // A Group may need masking for one of two reasons: it resides in a block that
5174   // needs predication, or it was decided to use masking to deal with gaps.
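  // As an illustrative example, a group that loads only members 0 and 2 of a
  // three-member interleaved layout has a gap, so its last wide accesses could
  // touch memory past the original loop bounds; if a scalar epilogue is not
  // allowed, those accesses must be masked instead.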
5175   bool PredicatedAccessRequiresMasking =
5176       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5177   bool AccessWithGapsRequiresMasking =
5178       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5179   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5180     return true;
5181 
5182   // If masked interleaving is required, we expect that the user/target had
5183   // enabled it, because otherwise it either wouldn't have been created or
5184   // it should have been invalidated by the CostModel.
5185   assert(useMaskedInterleavedAccesses(TTI) &&
5186          "Masked interleave-groups for predicated accesses are not enabled.");
5187 
5188   auto *Ty = getMemInstValueType(I);
5189   const Align Alignment = getLoadStoreAlignment(I);
5190   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5191                           : TTI.isLegalMaskedStore(Ty, Alignment);
5192 }
5193 
5194 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5195     Instruction *I, ElementCount VF) {
5196   // Get and ensure we have a valid memory instruction.
5197   LoadInst *LI = dyn_cast<LoadInst>(I);
5198   StoreInst *SI = dyn_cast<StoreInst>(I);
5199   assert((LI || SI) && "Invalid memory instruction");
5200 
5201   auto *Ptr = getLoadStorePointerOperand(I);
5202 
5203   // In order to be widened, the pointer should be consecutive, first of all.
5204   if (!Legal->isConsecutivePtr(Ptr))
5205     return false;
5206 
5207   // If the instruction is a store located in a predicated block, it will be
5208   // scalarized.
5209   if (isScalarWithPredication(I))
5210     return false;
5211 
  // If the instruction's allocated size doesn't equal its type size, it
5213   // requires padding and will be scalarized.
5214   auto &DL = I->getModule()->getDataLayout();
5215   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5216   if (hasIrregularType(ScalarTy, DL, VF))
5217     return false;
5218 
5219   return true;
5220 }
5221 
5222 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5223   // We should not collect Uniforms more than once per VF. Right now,
5224   // this function is called from collectUniformsAndScalars(), which
5225   // already does this check. Collecting Uniforms for VF=1 does not make any
5226   // sense.
5227 
5228   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5229          "This function should not be visited twice for the same VF");
5230 
  // Visit the list of Uniforms. If we find no uniform value, we won't
  // analyze it again; Uniforms.count(VF) will return 1.
5233   Uniforms[VF].clear();
5234 
5235   // We now know that the loop is vectorizable!
5236   // Collect instructions inside the loop that will remain uniform after
5237   // vectorization.
5238 
5239   // Global values, params and instructions outside of current loop are out of
5240   // scope.
5241   auto isOutOfScope = [&](Value *V) -> bool {
5242     Instruction *I = dyn_cast<Instruction>(V);
5243     return (!I || !TheLoop->contains(I));
5244   };
5245 
5246   SetVector<Instruction *> Worklist;
5247   BasicBlock *Latch = TheLoop->getLoopLatch();
5248 
5249   // Instructions that are scalar with predication must not be considered
5250   // uniform after vectorization, because that would create an erroneous
5251   // replicating region where only a single instance out of VF should be formed.
5252   // TODO: optimize such seldom cases if found important, see PR40816.
5253   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5254     if (isOutOfScope(I)) {
5255       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5256                         << *I << "\n");
5257       return;
5258     }
5259     if (isScalarWithPredication(I, VF)) {
5260       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5261                         << *I << "\n");
5262       return;
5263     }
5264     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5265     Worklist.insert(I);
5266   };
5267 
5268   // Start with the conditional branch. If the branch condition is an
5269   // instruction contained in the loop that is only used by the branch, it is
5270   // uniform.
5271   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5272   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5273     addToWorklistIfAllowed(Cmp);
5274 
5275   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5276     InstWidening WideningDecision = getWideningDecision(I, VF);
5277     assert(WideningDecision != CM_Unknown &&
5278            "Widening decision should be ready at this moment");
5279 
5280     // A uniform memory op is itself uniform.  We exclude uniform stores
5281     // here as they demand the last lane, not the first one.
5282     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5283       assert(WideningDecision == CM_Scalarize);
5284       return true;
5285     }
5286 
5287     return (WideningDecision == CM_Widen ||
5288             WideningDecision == CM_Widen_Reverse ||
5289             WideningDecision == CM_Interleave);
5290   };
5291 
5292 
5293   // Returns true if Ptr is the pointer operand of a memory access instruction
5294   // I, and I is known to not require scalarization.
5295   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5296     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5297   };
5298 
5299   // Holds a list of values which are known to have at least one uniform use.
5300   // Note that there may be other uses which aren't uniform.  A "uniform use"
5301   // here is something which only demands lane 0 of the unrolled iterations;
5302   // it does not imply that all lanes produce the same value (e.g. this is not
5303   // the usual meaning of uniform)
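  // For example, the pointer operand of a uniform (scalar) load only demands
  // lane 0 of its defining instruction, even though the loaded value may later
  // be broadcast to every lane (illustrative case).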
5304   SmallPtrSet<Value *, 8> HasUniformUse;
5305 
5306   // Scan the loop for instructions which are either a) known to have only
5307   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5308   for (auto *BB : TheLoop->blocks())
5309     for (auto &I : *BB) {
5310       // If there's no pointer operand, there's nothing to do.
5311       auto *Ptr = getLoadStorePointerOperand(&I);
5312       if (!Ptr)
5313         continue;
5314 
5315       // A uniform memory op is itself uniform.  We exclude uniform stores
5316       // here as they demand the last lane, not the first one.
5317       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5318         addToWorklistIfAllowed(&I);
5319 
5320       if (isUniformDecision(&I, VF)) {
5321         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5322         HasUniformUse.insert(Ptr);
5323       }
5324     }
5325 
5326   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5327   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5328   // disallows uses outside the loop as well.
5329   for (auto *V : HasUniformUse) {
5330     if (isOutOfScope(V))
5331       continue;
5332     auto *I = cast<Instruction>(V);
5333     auto UsersAreMemAccesses =
5334       llvm::all_of(I->users(), [&](User *U) -> bool {
5335         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5336       });
5337     if (UsersAreMemAccesses)
5338       addToWorklistIfAllowed(I);
5339   }
5340 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
5343   // a uniform instruction will only be used by uniform instructions.
5344   unsigned idx = 0;
5345   while (idx != Worklist.size()) {
5346     Instruction *I = Worklist[idx++];
5347 
5348     for (auto OV : I->operand_values()) {
5349       // isOutOfScope operands cannot be uniform instructions.
5350       if (isOutOfScope(OV))
5351         continue;
5352       // First order recurrence Phi's should typically be considered
5353       // non-uniform.
5354       auto *OP = dyn_cast<PHINode>(OV);
5355       if (OP && Legal->isFirstOrderRecurrence(OP))
5356         continue;
5357       // If all the users of the operand are uniform, then add the
5358       // operand into the uniform worklist.
5359       auto *OI = cast<Instruction>(OV);
5360       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5361             auto *J = cast<Instruction>(U);
5362             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5363           }))
5364         addToWorklistIfAllowed(OI);
5365     }
5366   }
5367 
5368   // For an instruction to be added into Worklist above, all its users inside
5369   // the loop should also be in Worklist. However, this condition cannot be
5370   // true for phi nodes that form a cyclic dependence. We must process phi
5371   // nodes separately. An induction variable will remain uniform if all users
5372   // of the induction variable and induction variable update remain uniform.
5373   // The code below handles both pointer and non-pointer induction variables.
5374   for (auto &Induction : Legal->getInductionVars()) {
5375     auto *Ind = Induction.first;
5376     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5377 
5378     // Determine if all users of the induction variable are uniform after
5379     // vectorization.
5380     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5381       auto *I = cast<Instruction>(U);
5382       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5383              isVectorizedMemAccessUse(I, Ind);
5384     });
5385     if (!UniformInd)
5386       continue;
5387 
5388     // Determine if all users of the induction variable update instruction are
5389     // uniform after vectorization.
5390     auto UniformIndUpdate =
5391         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5392           auto *I = cast<Instruction>(U);
5393           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5394                  isVectorizedMemAccessUse(I, IndUpdate);
5395         });
5396     if (!UniformIndUpdate)
5397       continue;
5398 
5399     // The induction variable and its update instruction will remain uniform.
5400     addToWorklistIfAllowed(Ind);
5401     addToWorklistIfAllowed(IndUpdate);
5402   }
5403 
5404   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5405 }
5406 
5407 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5408   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5409 
5410   if (Legal->getRuntimePointerChecking()->Need) {
5411     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5412         "runtime pointer checks needed. Enable vectorization of this "
5413         "loop with '#pragma clang loop vectorize(enable)' when "
5414         "compiling with -Os/-Oz",
5415         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5416     return true;
5417   }
5418 
5419   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5420     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5421         "runtime SCEV checks needed. Enable vectorization of this "
5422         "loop with '#pragma clang loop vectorize(enable)' when "
5423         "compiling with -Os/-Oz",
5424         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5425     return true;
5426   }
5427 
5428   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5429   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5430     reportVectorizationFailure("Runtime stride check for small trip count",
5431         "runtime stride == 1 checks needed. Enable vectorization of "
5432         "this loop without such check by compiling with -Os/-Oz",
5433         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5434     return true;
5435   }
5436 
5437   return false;
5438 }
5439 
5440 Optional<ElementCount>
5441 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5442   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to be
    // dynamically uniform if the target can skip it.
5445     reportVectorizationFailure(
5446         "Not inserting runtime ptr check for divergent target",
5447         "runtime pointer checks needed. Not enabled for divergent target",
5448         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5449     return None;
5450   }
5451 
5452   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5453   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5454   if (TC == 1) {
5455     reportVectorizationFailure("Single iteration (non) loop",
5456         "loop trip count is one, irrelevant for vectorization",
5457         "SingleIterationLoop", ORE, TheLoop);
5458     return None;
5459   }
5460 
5461   ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5462 
5463   switch (ScalarEpilogueStatus) {
5464   case CM_ScalarEpilogueAllowed:
5465     return MaxVF;
5466   case CM_ScalarEpilogueNotAllowedUsePredicate:
5467     LLVM_FALLTHROUGH;
5468   case CM_ScalarEpilogueNotNeededUsePredicate:
5469     LLVM_DEBUG(
5470         dbgs() << "LV: vector predicate hint/switch found.\n"
5471                << "LV: Not allowing scalar epilogue, creating predicated "
5472                << "vector loop.\n");
5473     break;
5474   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5475     // fallthrough as a special case of OptForSize
5476   case CM_ScalarEpilogueNotAllowedOptSize:
5477     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5478       LLVM_DEBUG(
5479           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5480     else
5481       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5482                         << "count.\n");
5483 
5484     // Bail if runtime checks are required, which are not good when optimising
5485     // for size.
5486     if (runtimeChecksRequired())
5487       return None;
5488     break;
5489   }
5490 
5491   // Now try the tail folding
5492 
5493   // Invalidate interleave groups that require an epilogue if we can't mask
5494   // the interleave-group.
5495   if (!useMaskedInterleavedAccesses(TTI)) {
5496     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5497            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
5500     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5501   }
5502 
5503   assert(!MaxVF.isScalable() &&
5504          "Scalable vectors do not yet support tail folding");
5505   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5506          "MaxVF must be a power of 2");
5507   unsigned MaxVFtimesIC =
5508       UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5509   // Avoid tail folding if the trip count is known to be a multiple of any VF we
5510   // chose.
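  // For example (numbers purely illustrative), a trip count of 1024 with
  // MaxVF = 8 and a user interleave count of 4 gives 1024 % 32 == 0, so no
  // scalar tail remains and folding is unnecessary.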
5511   ScalarEvolution *SE = PSE.getSE();
5512   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5513   const SCEV *ExitCount = SE->getAddExpr(
5514       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5515   const SCEV *Rem = SE->getURemExpr(
5516       ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5517   if (Rem->isZero()) {
5518     // Accept MaxVF if we do not have a tail.
5519     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5520     return MaxVF;
5521   }
5522 
5523   // If we don't know the precise trip count, or if the trip count that we
5524   // found modulo the vectorization factor is not zero, try to fold the tail
5525   // by masking.
5526   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5527   if (Legal->prepareToFoldTailByMasking()) {
5528     FoldTailByMasking = true;
5529     return MaxVF;
5530   }
5531 
5532   // If there was a tail-folding hint/switch, but we can't fold the tail by
5533   // masking, fallback to a vectorization with a scalar epilogue.
5534   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5535     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5536                          "scalar epilogue instead.\n");
5537     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5538     return MaxVF;
5539   }
5540 
5541   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5542     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5543     return None;
5544   }
5545 
5546   if (TC == 0) {
5547     reportVectorizationFailure(
5548         "Unable to calculate the loop count due to complex control flow",
5549         "unable to calculate the loop count due to complex control flow",
5550         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5551     return None;
5552   }
5553 
5554   reportVectorizationFailure(
5555       "Cannot optimize for size and vectorize at the same time.",
5556       "cannot optimize for size and vectorize at the same time. "
5557       "Enable vectorization of this loop with '#pragma clang loop "
5558       "vectorize(enable)' when compiling with -Os/-Oz",
5559       "NoTailLoopWithOptForSize", ORE, TheLoop);
5560   return None;
5561 }
5562 
5563 ElementCount
5564 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5565                                                  ElementCount UserVF) {
5566   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5567   unsigned SmallestType, WidestType;
5568   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5569   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5570 
5571   // Get the maximum safe dependence distance in bits computed by LAA.
5572   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
5574   // dependence distance).
5575   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
5576 
5577   if (UserVF.isNonZero()) {
5578     // For now, don't verify legality of scalable vectors.
5579     // This will be addressed properly in https://reviews.llvm.org/D91718.
5580     if (UserVF.isScalable())
5581       return UserVF;
5582 
5583     // If legally unsafe, clamp the user vectorization factor to a safe value.
5584     unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
5585     if (UserVF.getFixedValue() <= MaxSafeVF)
5586       return UserVF;
5587 
5588     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5589                       << " is unsafe, clamping to max safe VF=" << MaxSafeVF
5590                       << ".\n");
5591     ORE->emit([&]() {
5592       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5593                                         TheLoop->getStartLoc(),
5594                                         TheLoop->getHeader())
5595              << "User-specified vectorization factor "
5596              << ore::NV("UserVectorizationFactor", UserVF)
5597              << " is unsafe, clamping to maximum safe vectorization factor "
5598              << ore::NV("VectorizationFactor", MaxSafeVF);
5599     });
5600     return ElementCount::getFixed(MaxSafeVF);
5601   }
5602 
5603   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
5604 
5605   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
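  // For example (illustrative numbers), a 256-bit widest register and a 64-bit
  // widest type give a MaxVectorSize of 4, while a dependence distance that
  // clamps WidestRegister to 96 bits gives PowerOf2Floor(96 / 64) = 1.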
5607   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
5608 
5609   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5610                     << " / " << WidestType << " bits.\n");
5611   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5612                     << WidestRegister << " bits.\n");
5613 
5614   assert(MaxVectorSize <= WidestRegister &&
5615          "Did not expect to pack so many elements"
5616          " into one vector!");
5617   if (MaxVectorSize == 0) {
5618     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5619     MaxVectorSize = 1;
5620     return ElementCount::getFixed(MaxVectorSize);
5621   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5622              isPowerOf2_32(ConstTripCount)) {
5623     // We need to clamp the VF to be the ConstTripCount. There is no point in
5624     // choosing a higher viable VF as done in the loop below.
5625     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5626                       << ConstTripCount << "\n");
5627     MaxVectorSize = ConstTripCount;
5628     return ElementCount::getFixed(MaxVectorSize);
5629   }
5630 
5631   unsigned MaxVF = MaxVectorSize;
5632   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5633       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5634     // Collect all viable vectorization factors larger than the default MaxVF
5635     // (i.e. MaxVectorSize).
5636     SmallVector<ElementCount, 8> VFs;
5637     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5638     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5639       VFs.push_back(ElementCount::getFixed(VS));
5640 
5641     // For each VF calculate its register usage.
5642     auto RUs = calculateRegisterUsage(VFs);
5643 
5644     // Select the largest VF which doesn't require more registers than existing
5645     // ones.
5646     for (int i = RUs.size() - 1; i >= 0; --i) {
5647       bool Selected = true;
5648       for (auto& pair : RUs[i].MaxLocalUsers) {
5649         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5650         if (pair.second > TargetNumRegisters)
5651           Selected = false;
5652       }
5653       if (Selected) {
5654         MaxVF = VFs[i].getKnownMinValue();
5655         break;
5656       }
5657     }
5658     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5659       if (MaxVF < MinVF) {
5660         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5661                           << ") with target's minimum: " << MinVF << '\n');
5662         MaxVF = MinVF;
5663       }
5664     }
5665   }
5666   return ElementCount::getFixed(MaxVF);
5667 }
5668 
5669 VectorizationFactor
5670 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
5671   // FIXME: This can be fixed for scalable vectors later, because at this stage
5672   // the LoopVectorizer will only consider vectorizing a loop with scalable
5673   // vectors when the loop has a hint to enable vectorization for a given VF.
5674   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
5675 
5676   float Cost = expectedCost(ElementCount::getFixed(1)).first;
5677   const float ScalarCost = Cost;
5678   unsigned Width = 1;
5679   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5680 
5681   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5682   if (ForceVectorization && MaxVF.isVector()) {
5683     // Ignore scalar width, because the user explicitly wants vectorization.
5684     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5685     // evaluation.
5686     Cost = std::numeric_limits<float>::max();
5687   }
5688 
5689   for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
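    // For example (illustrative numbers), a scalar loop cost of 8 and a VF = 4
    // vector body cost of 20 normalize to 20 / 4 = 5 per scalar iteration,
    // which would be profitable.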
5693     VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5694     float VectorCost = C.first / (float)i;
5695     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5696                       << " costs: " << (int)VectorCost << ".\n");
5697     if (!C.second && !ForceVectorization) {
5698       LLVM_DEBUG(
5699           dbgs() << "LV: Not considering vector loop of width " << i
5700                  << " because it will not generate any vector instructions.\n");
5701       continue;
5702     }
5703 
5704     // If profitable add it to ProfitableVF list.
5705     if (VectorCost < ScalarCost) {
5706       ProfitableVFs.push_back(VectorizationFactor(
5707           {ElementCount::getFixed(i), (unsigned)VectorCost}));
5708     }
5709 
5710     if (VectorCost < Cost) {
5711       Cost = VectorCost;
5712       Width = i;
5713     }
5714   }
5715 
5716   if (!EnableCondStoresVectorization && NumPredStores) {
5717     reportVectorizationFailure("There are conditional stores.",
5718         "store that is conditionally executed prevents vectorization",
5719         "ConditionalStore", ORE, TheLoop);
5720     Width = 1;
5721     Cost = ScalarCost;
5722   }
5723 
5724   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5725              << "LV: Vectorization seems to be not beneficial, "
5726              << "but was forced by a user.\n");
5727   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5728   VectorizationFactor Factor = {ElementCount::getFixed(Width),
5729                                 (unsigned)(Width * Cost)};
5730   return Factor;
5731 }
5732 
5733 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5734     const Loop &L, ElementCount VF) const {
5735   // Cross iteration phis such as reductions need special handling and are
5736   // currently unsupported.
5737   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5738         return Legal->isFirstOrderRecurrence(&Phi) ||
5739                Legal->isReductionVariable(&Phi);
5740       }))
5741     return false;
5742 
5743   // Phis with uses outside of the loop require special handling and are
5744   // currently unsupported.
5745   for (auto &Entry : Legal->getInductionVars()) {
5746     // Look for uses of the value of the induction at the last iteration.
5747     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5748     for (User *U : PostInc->users())
5749       if (!L.contains(cast<Instruction>(U)))
5750         return false;
5751     // Look for uses of penultimate value of the induction.
5752     for (User *U : Entry.first->users())
5753       if (!L.contains(cast<Instruction>(U)))
5754         return false;
5755   }
5756 
5757   // Induction variables that are widened require special handling that is
5758   // currently not supported.
5759   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5760         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5761                  this->isProfitableToScalarize(Entry.first, VF));
5762       }))
5763     return false;
5764 
5765   return true;
5766 }
5767 
5768 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5769     const ElementCount VF) const {
5770   // FIXME: We need a much better cost-model to take different parameters such
5771   // as register pressure, code size increase and cost of extra branches into
5772   // account. For now we apply a very crude heuristic and only consider loops
5773   // with vectorization factors larger than a certain value.
5774   // We also consider epilogue vectorization unprofitable for targets that don't
5775   // consider interleaving beneficial (eg. MVE).
5776   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5777     return false;
5778   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5779     return true;
5780   return false;
5781 }
5782 
5783 VectorizationFactor
5784 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5785     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5786   VectorizationFactor Result = VectorizationFactor::Disabled();
5787   if (!EnableEpilogueVectorization) {
5788     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5789     return Result;
5790   }
5791 
5792   if (!isScalarEpilogueAllowed()) {
5793     LLVM_DEBUG(
5794         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5795                   "allowed.\n";);
5796     return Result;
5797   }
5798 
5799   // FIXME: This can be fixed for scalable vectors later, because at this stage
5800   // the LoopVectorizer will only consider vectorizing a loop with scalable
5801   // vectors when the loop has a hint to enable vectorization for a given VF.
5802   if (MainLoopVF.isScalable()) {
5803     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
5804                          "yet supported.\n");
5805     return Result;
5806   }
5807 
5808   // Not really a cost consideration, but check for unsupported cases here to
5809   // simplify the logic.
5810   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5811     LLVM_DEBUG(
5812         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5813                   "not a supported candidate.\n";);
5814     return Result;
5815   }
5816 
5817   if (EpilogueVectorizationForceVF > 1) {
5818     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5819     if (LVP.hasPlanWithVFs(
5820             {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
5821       return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
5822     else {
5823       LLVM_DEBUG(
5824           dbgs()
5825               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5826       return Result;
5827     }
5828   }
5829 
5830   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5831       TheLoop->getHeader()->getParent()->hasMinSize()) {
5832     LLVM_DEBUG(
5833         dbgs()
5834             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5835     return Result;
5836   }
5837 
5838   if (!isEpilogueVectorizationProfitable(MainLoopVF))
5839     return Result;
5840 
5841   for (auto &NextVF : ProfitableVFs)
5842     if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
5843         (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
5844         LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
5845       Result = NextVF;
5846 
5847   if (Result != VectorizationFactor::Disabled())
5848     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5849                       << Result.Width.getFixedValue() << "\n";);
5850   return Result;
5851 }
5852 
5853 std::pair<unsigned, unsigned>
5854 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5855   unsigned MinWidth = -1U;
5856   unsigned MaxWidth = 8;
5857   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5858 
5859   // For each block.
5860   for (BasicBlock *BB : TheLoop->blocks()) {
5861     // For each instruction in the loop.
5862     for (Instruction &I : BB->instructionsWithoutDebug()) {
5863       Type *T = I.getType();
5864 
5865       // Skip ignored values.
5866       if (ValuesToIgnore.count(&I))
5867         continue;
5868 
5869       // Only examine Loads, Stores and PHINodes.
5870       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5871         continue;
5872 
5873       // Examine PHI nodes that are reduction variables. Update the type to
5874       // account for the recurrence type.
5875       if (auto *PN = dyn_cast<PHINode>(&I)) {
5876         if (!Legal->isReductionVariable(PN))
5877           continue;
5878         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5879         T = RdxDesc.getRecurrenceType();
5880       }
5881 
5882       // Examine the stored values.
5883       if (auto *ST = dyn_cast<StoreInst>(&I))
5884         T = ST->getValueOperand()->getType();
5885 
5886       // Ignore loaded pointer types and stored pointer types that are not
5887       // vectorizable.
5888       //
5889       // FIXME: The check here attempts to predict whether a load or store will
5890       //        be vectorized. We only know this for certain after a VF has
5891       //        been selected. Here, we assume that if an access can be
5892       //        vectorized, it will be. We should also look at extending this
5893       //        optimization to non-pointer types.
5894       //
5895       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5896           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5897         continue;
5898 
5899       MinWidth = std::min(MinWidth,
5900                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5901       MaxWidth = std::max(MaxWidth,
5902                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5903     }
5904   }
5905 
5906   return {MinWidth, MaxWidth};
5907 }
5908 
5909 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5910                                                            unsigned LoopCost) {
5911   // -- The interleave heuristics --
5912   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5913   // There are many micro-architectural considerations that we can't predict
5914   // at this level. For example, frontend pressure (on decode or fetch) due to
5915   // code size, or the number and capabilities of the execution ports.
5916   //
5917   // We use the following heuristics to select the interleave count:
5918   // 1. If the code has reductions, then we interleave to break the cross
5919   // iteration dependency.
5920   // 2. If the loop is really small, then we interleave to reduce the loop
5921   // overhead.
5922   // 3. We don't interleave if we think that we will spill registers to memory
5923   // due to the increased register pressure.
5924 
5925   if (!isScalarEpilogueAllowed())
5926     return 1;
5927 
  // If a maximum safe dependence distance was found, it already constrains the
  // vectorization factor; interleaving would cover an even larger distance per
  // iteration, so do not interleave.
5929   if (Legal->getMaxSafeDepDistBytes() != -1U)
5930     return 1;
5931 
5932   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5933   const bool HasReductions = !Legal->getReductionVars().empty();
5934   // Do not interleave loops with a relatively small known or estimated trip
5935   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5937   // because with the above conditions interleaving can expose ILP and break
5938   // cross iteration dependences for reductions.
5939   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5940       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5941     return 1;
5942 
5943   RegisterUsage R = calculateRegisterUsage({VF})[0];
5944   // We divide by these constants so assume that we have at least one
5945   // instruction that uses at least one register.
5946   for (auto& pair : R.MaxLocalUsers) {
5947     pair.second = std::max(pair.second, 1U);
5948   }
5949 
5950   // We calculate the interleave count using the following formula.
5951   // Subtract the number of loop invariants from the number of available
5952   // registers. These registers are used by all of the interleaved instances.
5953   // Next, divide the remaining registers by the number of registers that is
5954   // required by the loop, in order to estimate how many parallel instances
5955   // fit without causing spills. All of this is rounded down if necessary to be
5956   // a power of two. We want power of two interleave count to simplify any
5957   // addressing operations or alignment considerations.
5958   // We also want power of two interleave counts to ensure that the induction
5959   // variable of the vector loop wraps to zero, when tail is folded by masking;
5960   // this currently happens when OptForSize, in which case IC is set to 1 above.
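  // For example (illustrative numbers), with 32 registers in a class, 2 of
  // them loop invariant and a maximum local usage of 6, (32 - 2) / 6 = 5,
  // which PowerOf2Floor reduces to an interleave count of 4 (before the
  // induction-variable adjustment below).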
5961   unsigned IC = UINT_MAX;
5962 
5963   for (auto& pair : R.MaxLocalUsers) {
5964     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5965     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5966                       << " registers of "
5967                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5968     if (VF.isScalar()) {
5969       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5970         TargetNumRegisters = ForceTargetNumScalarRegs;
5971     } else {
5972       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5973         TargetNumRegisters = ForceTargetNumVectorRegs;
5974     }
5975     unsigned MaxLocalUsers = pair.second;
5976     unsigned LoopInvariantRegs = 0;
5977     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5978       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5979 
5980     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5981     // Don't count the induction variable as interleaved.
5982     if (EnableIndVarRegisterHeur) {
5983       TmpIC =
5984           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5985                         std::max(1U, (MaxLocalUsers - 1)));
5986     }
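    // Continuing the hypothetical example above with the induction-variable
    // heuristic enabled: (32 - 2 - 1) / max(1, 7 - 1) = 29 / 6 = 4, so the
    // candidate count is still PowerOf2Floor(4) = 4 here.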
5987 
5988     IC = std::min(IC, TmpIC);
5989   }
5990 
5991   // Clamp the interleave ranges to reasonable counts.
5992   unsigned MaxInterleaveCount =
5993       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5994 
5995   // Check if the user has overridden the max.
5996   if (VF.isScalar()) {
5997     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5998       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5999   } else {
6000     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6001       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6002   }
6003 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF, while keeping it
  // at least 1.
6007   //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second
  // vector iteration is enabled. However, for larger loops, there is likely to
  // be a similar benefit as for fixed-width vectors. For now, we choose to
  // leave the InterleaveCount as if vscale were '1', although if some
  // information about the vector is known (e.g. min vector size), we could
  // make a better decision.
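  // For example (illustrative values only): with an estimated trip count of
  // 24 and VF = 8, the interleave count is capped at 24 / 8 = 3 so that the
  // interleaved vector body does not run past the loop's iterations.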
6014   if (BestKnownTC) {
6015     MaxInterleaveCount =
6016         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6017     // Make sure MaxInterleaveCount is greater than 0.
6018     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6019   }
6020 
6021   assert(MaxInterleaveCount > 0 &&
6022          "Maximum interleave count must be greater than 0");
6023 
  // Clamp the calculated IC to be between 1 and the maximum interleave count
  // that the target and trip count allow.
6026   if (IC > MaxInterleaveCount)
6027     IC = MaxInterleaveCount;
6028   else
6029     // Make sure IC is greater than 0.
6030     IC = std::max(1u, IC);
6031 
6032   assert(IC > 0 && "Interleave count must be greater than 0.");
6033 
  // If we did not calculate the cost for VF (because the user selected the
  // VF), then we calculate the cost of VF here.
6036   if (LoopCost == 0)
6037     LoopCost = expectedCost(VF).first;
6038 
6039   assert(LoopCost && "Non-zero loop cost expected");
6040 
6041   // Interleave if we vectorized this loop and there is a reduction that could
6042   // benefit from interleaving.
6043   if (VF.isVector() && HasReductions) {
6044     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6045     return IC;
6046   }
6047 
6048   // Note that if we've already vectorized the loop we will have done the
6049   // runtime check and so interleaving won't require further checks.
6050   bool InterleavingRequiresRuntimePointerCheck =
6051       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6052 
6053   // We want to interleave small loops in order to reduce the loop overhead and
6054   // potentially expose ILP opportunities.
6055   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6056                     << "LV: IC is " << IC << '\n'
6057                     << "LV: VF is " << VF << '\n');
6058   const bool AggressivelyInterleaveReductions =
6059       TTI.enableAggressiveInterleaving(HasReductions);
6060   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the per-iteration loop overhead costs roughly 1, and we
    // use the cost model's estimate of the loop body to interleave until the
    // loop overhead is about 5% of the cost of the interleaved loop.
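    // For instance (hypothetical costs): if SmallLoopCost were 20 and the
    // estimated LoopCost were 3, SmallIC would be min(IC, PowerOf2Floor(6)),
    // i.e. at most 4.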
6064     unsigned SmallIC =
6065         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6066 
6067     // Interleave until store/load ports (estimated by max interleave count) are
6068     // saturated.
6069     unsigned NumStores = Legal->getNumStores();
6070     unsigned NumLoads = Legal->getNumLoads();
6071     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6072     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
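    // E.g., with IC = 8 and a loop containing two stores and one load
    // (illustrative numbers), StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 1 = 8,
    // so interleaving by 8 would be enough to keep the load port busy.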
6073 
    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit the interleave count,
    // by default to 2, so the critical path only gets increased by one
    // reduction operation.
6078     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6079       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6080       SmallIC = std::min(SmallIC, F);
6081       StoresIC = std::min(StoresIC, F);
6082       LoadsIC = std::min(LoadsIC, F);
6083     }
6084 
6085     if (EnableLoadStoreRuntimeInterleave &&
6086         std::max(StoresIC, LoadsIC) > SmallIC) {
6087       LLVM_DEBUG(
6088           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6089       return std::max(StoresIC, LoadsIC);
6090     }
6091 
6092     // If there are scalar reductions and TTI has enabled aggressive
6093     // interleaving for reductions, we will interleave to expose ILP.
6094     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6095         AggressivelyInterleaveReductions) {
6096       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC, but not as aggressively as the normal
      // IC, to handle the rare case where resources are too limited.
6099       return std::max(IC / 2, SmallIC);
6100     } else {
6101       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6102       return SmallIC;
6103     }
6104   }
6105 
6106   // Interleave if this is a large loop (small loops are already dealt with by
6107   // this point) that could benefit from interleaving.
6108   if (AggressivelyInterleaveReductions) {
6109     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6110     return IC;
6111   }
6112 
6113   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6114   return 1;
6115 }
6116 
6117 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6118 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and assign
  // a number to each instruction. We use RPO to ensure that defs are met
  // before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it
  // in a set. If we find this value in the multi-map then we remove it from
  // the set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but
  // are used inside the loop. We need this number separately from the
  // max-interval usage number because, when we unroll, loop-invariant values
  // do not take more registers.
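  // As a rough illustration (hypothetical IR), for a loop body such as
  //   %a = load i32, i32* %p
  //   %b = add i32 %a, %inv     ; last in-loop use of %a
  //   %c = mul i32 %b, %b       ; last in-loop use of %b
  //   store i32 %c, i32* %q
  // at most two of %a/%b/%c are live at the same point, so the local register
  // estimate is 2, while %inv, %p and %q are counted separately as
  // loop-invariant values.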
6136   LoopBlocksDFS DFS(TheLoop);
6137   DFS.perform(LI);
6138 
6139   RegisterUsage RU;
6140 
6141   // Each 'key' in the map opens a new interval. The values
6142   // of the map are the index of the 'last seen' usage of the
6143   // instruction that is the key.
6144   using IntervalMap = DenseMap<Instruction *, unsigned>;
6145 
6146   // Maps instruction to its index.
6147   SmallVector<Instruction *, 64> IdxToInstr;
6148   // Marks the end of each interval.
6149   IntervalMap EndPoint;
  // Saves the set of instructions that have at least one use inside the loop.
6151   SmallPtrSet<Instruction *, 8> Ends;
6152   // Saves the list of values that are used in the loop but are
6153   // defined outside the loop, such as arguments and constants.
6154   SmallPtrSet<Value *, 8> LoopInvariants;
6155 
6156   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6157     for (Instruction &I : BB->instructionsWithoutDebug()) {
6158       IdxToInstr.push_back(&I);
6159 
6160       // Save the end location of each USE.
6161       for (Value *U : I.operands()) {
6162         auto *Instr = dyn_cast<Instruction>(U);
6163 
6164         // Ignore non-instruction values such as arguments, constants, etc.
6165         if (!Instr)
6166           continue;
6167 
6168         // If this instruction is outside the loop then record it and continue.
6169         if (!TheLoop->contains(Instr)) {
6170           LoopInvariants.insert(Instr);
6171           continue;
6172         }
6173 
6174         // Overwrite previous end points.
6175         EndPoint[Instr] = IdxToInstr.size();
6176         Ends.insert(Instr);
6177       }
6178     }
6179   }
6180 
6181   // Saves the list of intervals that end with the index in 'key'.
6182   using InstrList = SmallVector<Instruction *, 2>;
6183   DenseMap<unsigned, InstrList> TransposeEnds;
6184 
6185   // Transpose the EndPoints to a list of values that end at each index.
6186   for (auto &Interval : EndPoint)
6187     TransposeEnds[Interval.second].push_back(Interval.first);
6188 
6189   SmallPtrSet<Instruction *, 8> OpenIntervals;
6190   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6191   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6192 
6193   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6194 
6195   // A lambda that gets the register usage for the given type and VF.
6196   const auto &TTICapture = TTI;
6197   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
6198     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6199       return 0U;
6200     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6201   };
6202 
6203   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6204     Instruction *I = IdxToInstr[i];
6205 
6206     // Remove all of the instructions that end at this location.
6207     InstrList &List = TransposeEnds[i];
6208     for (Instruction *ToRemove : List)
6209       OpenIntervals.erase(ToRemove);
6210 
6211     // Ignore instructions that are never used within the loop.
6212     if (!Ends.count(I))
6213       continue;
6214 
6215     // Skip ignored values.
6216     if (ValuesToIgnore.count(I))
6217       continue;
6218 
6219     // For each VF find the maximum usage of registers.
6220     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6221       // Count the number of live intervals.
6222       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6223 
6224       if (VFs[j].isScalar()) {
6225         for (auto Inst : OpenIntervals) {
6226           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6227           if (RegUsage.find(ClassID) == RegUsage.end())
6228             RegUsage[ClassID] = 1;
6229           else
6230             RegUsage[ClassID] += 1;
6231         }
6232       } else {
6233         collectUniformsAndScalars(VFs[j]);
6234         for (auto Inst : OpenIntervals) {
6235           // Skip ignored values for VF > 1.
6236           if (VecValuesToIgnore.count(Inst))
6237             continue;
6238           if (isScalarAfterVectorization(Inst, VFs[j])) {
6239             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6240             if (RegUsage.find(ClassID) == RegUsage.end())
6241               RegUsage[ClassID] = 1;
6242             else
6243               RegUsage[ClassID] += 1;
6244           } else {
6245             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
6246             if (RegUsage.find(ClassID) == RegUsage.end())
6247               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6248             else
6249               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6250           }
6251         }
6252       }
6253 
6254       for (auto& pair : RegUsage) {
6255         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
6256           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
6257         else
6258           MaxUsages[j][pair.first] = pair.second;
6259       }
6260     }
6261 
6262     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6263                       << OpenIntervals.size() << '\n');
6264 
6265     // Add the current instruction to the list of open intervals.
6266     OpenIntervals.insert(I);
6267   }
6268 
6269   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6270     SmallMapVector<unsigned, unsigned, 4> Invariant;
6271 
6272     for (auto Inst : LoopInvariants) {
6273       unsigned Usage =
6274           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6275       unsigned ClassID =
6276           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6277       if (Invariant.find(ClassID) == Invariant.end())
6278         Invariant[ClassID] = Usage;
6279       else
6280         Invariant[ClassID] += Usage;
6281     }
6282 
6283     LLVM_DEBUG({
6284       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6285       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6286              << " item\n";
6287       for (const auto &pair : MaxUsages[i]) {
6288         dbgs() << "LV(REG): RegisterClass: "
6289                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6290                << " registers\n";
6291       }
6292       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6293              << " item\n";
6294       for (const auto &pair : Invariant) {
6295         dbgs() << "LV(REG): RegisterClass: "
6296                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6297                << " registers\n";
6298       }
6299     });
6300 
6301     RU.LoopInvariantRegs = Invariant;
6302     RU.MaxLocalUsers = MaxUsages[i];
6303     RUs[i] = RU;
6304   }
6305 
6306   return RUs;
6307 }
6308 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
  // TODO: The cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // A limited number of emulated Masked Stores/Scatters was allowed.
6318   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
6319   return isa<LoadInst>(I) ||
6320          (isa<StoreInst>(I) &&
6321           NumPredStores > NumberOfStoresToPredicate);
6322 }
6323 
6324 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6325   // If we aren't vectorizing the loop, or if we've already collected the
6326   // instructions to scalarize, there's nothing to do. Collection may already
6327   // have occurred if we have a user-selected VF and are now computing the
6328   // expected cost for interleaving.
6329   if (VF.isScalar() || VF.isZero() ||
6330       InstsToScalarize.find(VF) != InstsToScalarize.end())
6331     return;
6332 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6334   // not profitable to scalarize any instructions, the presence of VF in the
6335   // map will indicate that we've analyzed it already.
6336   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6337 
6338   // Find all the instructions that are scalar with predication in the loop and
6339   // determine if it would be better to not if-convert the blocks they are in.
6340   // If so, we also record the instructions to scalarize.
6341   for (BasicBlock *BB : TheLoop->blocks()) {
6342     if (!blockNeedsPredication(BB))
6343       continue;
6344     for (Instruction &I : *BB)
6345       if (isScalarWithPredication(&I)) {
6346         ScalarCostsTy ScalarCosts;
        // Do not apply the discount logic if the hacked cost is needed
        // for emulated masked memrefs.
6349         if (!useEmulatedMaskMemRefHack(&I) &&
6350             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6351           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6352         // Remember that BB will remain after vectorization.
6353         PredicatedBBsAfterVectorization.insert(BB);
6354       }
6355   }
6356 }
6357 
6358 int LoopVectorizationCostModel::computePredInstDiscount(
6359     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
6360     ElementCount VF) {
6361   assert(!isUniformAfterVectorization(PredInst, VF) &&
6362          "Instruction marked uniform-after-vectorization will be predicated");
6363 
6364   // Initialize the discount to zero, meaning that the scalar version and the
6365   // vector version cost the same.
6366   int Discount = 0;
6367 
6368   // Holds instructions to analyze. The instructions we visit are mapped in
6369   // ScalarCosts. Those instructions are the ones that would be scalarized if
6370   // we find that the scalar version costs less.
6371   SmallVector<Instruction *, 8> Worklist;
6372 
6373   // Returns true if the given instruction can be scalarized.
6374   auto canBeScalarized = [&](Instruction *I) -> bool {
6375     // We only attempt to scalarize instructions forming a single-use chain
6376     // from the original predicated block that would otherwise be vectorized.
6377     // Although not strictly necessary, we give up on instructions we know will
6378     // already be scalar to avoid traversing chains that are unlikely to be
6379     // beneficial.
6380     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6381         isScalarAfterVectorization(I, VF))
6382       return false;
6383 
6384     // If the instruction is scalar with predication, it will be analyzed
6385     // separately. We ignore it within the context of PredInst.
6386     if (isScalarWithPredication(I))
6387       return false;
6388 
6389     // If any of the instruction's operands are uniform after vectorization,
6390     // the instruction cannot be scalarized. This prevents, for example, a
6391     // masked load from being scalarized.
6392     //
6393     // We assume we will only emit a value for lane zero of an instruction
6394     // marked uniform after vectorization, rather than VF identical values.
6395     // Thus, if we scalarize an instruction that uses a uniform, we would
6396     // create uses of values corresponding to the lanes we aren't emitting code
6397     // for. This behavior can be changed by allowing getScalarValue to clone
6398     // the lane zero values for uniforms rather than asserting.
6399     for (Use &U : I->operands())
6400       if (auto *J = dyn_cast<Instruction>(U.get()))
6401         if (isUniformAfterVectorization(J, VF))
6402           return false;
6403 
6404     // Otherwise, we can scalarize the instruction.
6405     return true;
6406   };
6407 
6408   // Compute the expected cost discount from scalarizing the entire expression
6409   // feeding the predicated instruction. We currently only consider expressions
6410   // that are single-use instruction chains.
6411   Worklist.push_back(PredInst);
6412   while (!Worklist.empty()) {
6413     Instruction *I = Worklist.pop_back_val();
6414 
6415     // If we've already analyzed the instruction, there's nothing to do.
6416     if (ScalarCosts.find(I) != ScalarCosts.end())
6417       continue;
6418 
6419     // Compute the cost of the vector instruction. Note that this cost already
6420     // includes the scalarization overhead of the predicated instruction.
6421     unsigned VectorCost = getInstructionCost(I, VF).first;
6422 
6423     // Compute the cost of the scalarized instruction. This cost is the cost of
6424     // the instruction as if it wasn't if-converted and instead remained in the
6425     // predicated block. We will scale this cost by block probability after
6426     // computing the scalarization overhead.
6427     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6428     unsigned ScalarCost =
6429         VF.getKnownMinValue() *
6430         getInstructionCost(I, ElementCount::getFixed(1)).first;
6431 
6432     // Compute the scalarization overhead of needed insertelement instructions
6433     // and phi nodes.
6434     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6435       ScalarCost += TTI.getScalarizationOverhead(
6436           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6437           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6438       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6439       ScalarCost +=
6440           VF.getKnownMinValue() *
6441           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6442     }
6443 
6444     // Compute the scalarization overhead of needed extractelement
6445     // instructions. For each of the instruction's operands, if the operand can
6446     // be scalarized, add it to the worklist; otherwise, account for the
6447     // overhead.
6448     for (Use &U : I->operands())
6449       if (auto *J = dyn_cast<Instruction>(U.get())) {
6450         assert(VectorType::isValidElementType(J->getType()) &&
6451                "Instruction has non-scalar type");
6452         if (canBeScalarized(J))
6453           Worklist.push_back(J);
6454         else if (needsExtract(J, VF)) {
6455           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6456           ScalarCost += TTI.getScalarizationOverhead(
6457               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6458               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6459         }
6460       }
6461 
6462     // Scale the total scalar cost by block probability.
6463     ScalarCost /= getReciprocalPredBlockProb();
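    // E.g., assuming the reciprocal block probability is 2 (a predicated
    // block modelled as executing on half of the iterations), a raw scalar
    // cost of 10 is scaled down to 5 here.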
6464 
6465     // Compute the discount. A non-negative discount means the vector version
6466     // of the instruction costs more, and scalarizing would be beneficial.
6467     Discount += VectorCost - ScalarCost;
6468     ScalarCosts[I] = ScalarCost;
6469   }
6470 
6471   return Discount;
6472 }
6473 
6474 LoopVectorizationCostModel::VectorizationCostTy
6475 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6476   VectorizationCostTy Cost;
6477 
6478   // For each block.
6479   for (BasicBlock *BB : TheLoop->blocks()) {
6480     VectorizationCostTy BlockCost;
6481 
6482     // For each instruction in the old loop.
6483     for (Instruction &I : BB->instructionsWithoutDebug()) {
6484       // Skip ignored values.
6485       if (ValuesToIgnore.count(&I) ||
6486           (VF.isVector() && VecValuesToIgnore.count(&I)))
6487         continue;
6488 
6489       VectorizationCostTy C = getInstructionCost(&I, VF);
6490 
6491       // Check if we should override the cost.
6492       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6493         C.first = ForceTargetInstructionCost;
6494 
6495       BlockCost.first += C.first;
6496       BlockCost.second |= C.second;
6497       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6498                         << " for VF " << VF << " For instruction: " << I
6499                         << '\n');
6500     }
6501 
6502     // If we are vectorizing a predicated block, it will have been
6503     // if-converted. This means that the block's instructions (aside from
6504     // stores and instructions that may divide by zero) will now be
6505     // unconditionally executed. For the scalar case, we may not always execute
6506     // the predicated block, if it is an if-else block. Thus, scale the block's
6507     // cost by the probability of executing it. blockNeedsPredication from
6508     // Legal is used so as to not include all blocks in tail folded loops.
6509     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6510       BlockCost.first /= getReciprocalPredBlockProb();
6511 
6512     Cost.first += BlockCost.first;
6513     Cost.second |= BlockCost.second;
6514   }
6515 
6516   return Cost;
6517 }
6518 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop-invariant except for the induction-variable dependence.
6521 ///
6522 /// This SCEV can be sent to the Target in order to estimate the address
6523 /// calculation cost.
static const SCEV *getAddressAccessSCEV(Value *Ptr,
                                        LoopVectorizationLegality *Legal,
                                        PredicatedScalarEvolution &PSE,
                                        const Loop *TheLoop) {
6530   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6531   if (!Gep)
6532     return nullptr;
6533 
  // We are looking for a GEP with all loop-invariant indices except for one,
  // which should be an induction variable.
6536   auto SE = PSE.getSE();
6537   unsigned NumOperands = Gep->getNumOperands();
6538   for (unsigned i = 1; i < NumOperands; ++i) {
6539     Value *Opd = Gep->getOperand(i);
6540     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6541         !Legal->isInductionVariable(Opd))
6542       return nullptr;
6543   }
6544 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6546   return PSE.getSCEV(Ptr);
6547 }
6548 
6549 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6550   return Legal->hasStride(I->getOperand(0)) ||
6551          Legal->hasStride(I->getOperand(1));
6552 }
6553 
6554 unsigned
6555 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6556                                                         ElementCount VF) {
6557   assert(VF.isVector() &&
6558          "Scalarization cost of instruction implies vectorization.");
6559   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6560   Type *ValTy = getMemInstValueType(I);
6561   auto SE = PSE.getSE();
6562 
6563   unsigned AS = getLoadStoreAddressSpace(I);
6564   Value *Ptr = getLoadStorePointerOperand(I);
6565   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6566 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6569   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6570 
6571   // Get the cost of the scalar memory instruction and address computation.
6572   unsigned Cost =
6573       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6574 
6575   // Don't pass *I here, since it is scalar but will actually be part of a
6576   // vectorized loop where the user of it is a vectorized instruction.
6577   const Align Alignment = getLoadStoreAlignment(I);
6578   Cost += VF.getKnownMinValue() *
6579           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6580                               AS, TTI::TCK_RecipThroughput);
6581 
6582   // Get the overhead of the extractelement and insertelement instructions
6583   // we might create due to scalarization.
6584   Cost += getScalarizationOverhead(I, VF);
6585 
6586   // If we have a predicated store, it may not be executed for each vector
6587   // lane. Scale the cost by the probability of executing the predicated
6588   // block.
6589   if (isPredicatedInst(I)) {
6590     Cost /= getReciprocalPredBlockProb();
6591 
6592     if (useEmulatedMaskMemRefHack(I))
6593       // Artificially setting to a high enough value to practically disable
6594       // vectorization with such operations.
6595       Cost = 3000000;
6596   }
6597 
6598   return Cost;
6599 }
6600 
6601 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6602                                                              ElementCount VF) {
6603   Type *ValTy = getMemInstValueType(I);
6604   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6605   Value *Ptr = getLoadStorePointerOperand(I);
6606   unsigned AS = getLoadStoreAddressSpace(I);
6607   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6608   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6609 
6610   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6611          "Stride should be 1 or -1 for consecutive memory access");
6612   const Align Alignment = getLoadStoreAlignment(I);
6613   unsigned Cost = 0;
6614   if (Legal->isMaskRequired(I))
6615     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6616                                       CostKind);
6617   else
6618     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6619                                 CostKind, I);
6620 
6621   bool Reverse = ConsecutiveStride < 0;
6622   if (Reverse)
6623     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6624   return Cost;
6625 }
6626 
6627 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6628                                                          ElementCount VF) {
6629   assert(Legal->isUniformMemOp(*I));
6630 
6631   Type *ValTy = getMemInstValueType(I);
6632   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6633   const Align Alignment = getLoadStoreAlignment(I);
6634   unsigned AS = getLoadStoreAddressSpace(I);
6635   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6636   if (isa<LoadInst>(I)) {
6637     return TTI.getAddressComputationCost(ValTy) +
6638            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6639                                CostKind) +
6640            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6641   }
6642   StoreInst *SI = cast<StoreInst>(I);
6643 
6644   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
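  // Note: when the stored value is not loop-invariant, the scalar store
  // conceptually keeps only the value from the final unrolled lane, hence the
  // extract of element VF - 1 below (e.g. lane 3 for VF = 4).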
6645   return TTI.getAddressComputationCost(ValTy) +
6646          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6647                              CostKind) +
6648          (isLoopInvariantStoreValue
6649               ? 0
6650               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6651                                        VF.getKnownMinValue() - 1));
6652 }
6653 
6654 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6655                                                           ElementCount VF) {
6656   Type *ValTy = getMemInstValueType(I);
6657   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6658   const Align Alignment = getLoadStoreAlignment(I);
6659   const Value *Ptr = getLoadStorePointerOperand(I);
6660 
6661   return TTI.getAddressComputationCost(VectorTy) +
6662          TTI.getGatherScatterOpCost(
6663              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6664              TargetTransformInfo::TCK_RecipThroughput, I);
6665 }
6666 
6667 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6668                                                             ElementCount VF) {
6669   Type *ValTy = getMemInstValueType(I);
6670   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6671   unsigned AS = getLoadStoreAddressSpace(I);
6672 
6673   auto Group = getInterleavedAccessGroup(I);
6674   assert(Group && "Fail to get an interleaved access group.");
6675 
6676   unsigned InterleaveFactor = Group->getFactor();
6677   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6678   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6679 
6680   // Holds the indices of existing members in an interleaved load group.
6681   // An interleaved store group doesn't need this as it doesn't allow gaps.
6682   SmallVector<unsigned, 4> Indices;
6683   if (isa<LoadInst>(I)) {
6684     for (unsigned i = 0; i < InterleaveFactor; i++)
6685       if (Group->getMember(i))
6686         Indices.push_back(i);
6687   }
6688 
6689   // Calculate the cost of the whole interleaved group.
6690   bool UseMaskForGaps =
6691       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6692   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6693       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6694       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6695 
6696   if (Group->isReverse()) {
6697     // TODO: Add support for reversed masked interleaved access.
6698     assert(!Legal->isMaskRequired(I) &&
6699            "Reverse masked interleaved access not supported.");
6700     Cost += Group->getNumMembers() *
6701             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6702   }
6703   return Cost;
6704 }
6705 
6706 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6707                                                               ElementCount VF) {
6708   // Calculate scalar cost only. Vectorization cost should be ready at this
6709   // moment.
6710   if (VF.isScalar()) {
6711     Type *ValTy = getMemInstValueType(I);
6712     const Align Alignment = getLoadStoreAlignment(I);
6713     unsigned AS = getLoadStoreAddressSpace(I);
6714 
6715     return TTI.getAddressComputationCost(ValTy) +
6716            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6717                                TTI::TCK_RecipThroughput, I);
6718   }
6719   return getWideningCost(I, VF);
6720 }
6721 
6722 LoopVectorizationCostModel::VectorizationCostTy
6723 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6724                                                ElementCount VF) {
6725   // If we know that this instruction will remain uniform, check the cost of
6726   // the scalar version.
6727   if (isUniformAfterVectorization(I, VF))
6728     VF = ElementCount::getFixed(1);
6729 
6730   if (VF.isVector() && isProfitableToScalarize(I, VF))
6731     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6732 
6733   // Forced scalars do not have any scalarization overhead.
6734   auto ForcedScalar = ForcedScalars.find(VF);
6735   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6736     auto InstSet = ForcedScalar->second;
6737     if (InstSet.count(I))
6738       return VectorizationCostTy(
6739           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6740            VF.getKnownMinValue()),
6741           false);
6742   }
6743 
6744   Type *VectorTy;
6745   unsigned C = getInstructionCost(I, VF, VectorTy);
6746 
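  // The type counts as "not scalarized" when the target legalizes the vector
  // into fewer parts than there are lanes; e.g. a <4 x i32> that fits in a
  // single target register is one part for four lanes.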
6747   bool TypeNotScalarized =
6748       VF.isVector() && VectorTy->isVectorTy() &&
6749       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6750   return VectorizationCostTy(C, TypeNotScalarized);
6751 }
6752 
6753 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6754                                                               ElementCount VF) {
6755 
6756   assert(!VF.isScalable() &&
6757          "cannot compute scalarization overhead for scalable vectorization");
6758   if (VF.isScalar())
6759     return 0;
6760 
6761   unsigned Cost = 0;
6762   Type *RetTy = ToVectorTy(I->getType(), VF);
6763   if (!RetTy->isVoidTy() &&
6764       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6765     Cost += TTI.getScalarizationOverhead(
6766         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6767         true, false);
6768 
6769   // Some targets keep addresses scalar.
6770   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6771     return Cost;
6772 
6773   // Some targets support efficient element stores.
6774   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6775     return Cost;
6776 
6777   // Collect operands to consider.
6778   CallInst *CI = dyn_cast<CallInst>(I);
6779   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6780 
6781   // Skip operands that do not require extraction/scalarization and do not incur
6782   // any overhead.
6783   return Cost + TTI.getOperandsScalarizationOverhead(
6784                     filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6785 }
6786 
6787 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6788   if (VF.isScalar())
6789     return;
6790   NumPredStores = 0;
6791   for (BasicBlock *BB : TheLoop->blocks()) {
6792     // For each instruction in the old loop.
6793     for (Instruction &I : *BB) {
6794       Value *Ptr =  getLoadStorePointerOperand(&I);
6795       if (!Ptr)
6796         continue;
6797 
6798       // TODO: We should generate better code and update the cost model for
6799       // predicated uniform stores. Today they are treated as any other
6800       // predicated store (see added test cases in
6801       // invariant-store-vectorization.ll).
6802       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6803         NumPredStores++;
6804 
6805       if (Legal->isUniformMemOp(I)) {
6806         // TODO: Avoid replicating loads and stores instead of
6807         // relying on instcombine to remove them.
6808         // Load: Scalar load + broadcast
6809         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6810         unsigned Cost = getUniformMemOpCost(&I, VF);
6811         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6812         continue;
6813       }
6814 
6815       // We assume that widening is the best solution when possible.
6816       if (memoryInstructionCanBeWidened(&I, VF)) {
6817         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6818         int ConsecutiveStride =
6819                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6820         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6821                "Expected consecutive stride.");
6822         InstWidening Decision =
6823             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6824         setWideningDecision(&I, VF, Decision, Cost);
6825         continue;
6826       }
6827 
6828       // Choose between Interleaving, Gather/Scatter or Scalarization.
6829       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6830       unsigned NumAccesses = 1;
6831       if (isAccessInterleaved(&I)) {
6832         auto Group = getInterleavedAccessGroup(&I);
6833         assert(Group && "Fail to get an interleaved access group.");
6834 
6835         // Make one decision for the whole group.
6836         if (getWideningDecision(&I, VF) != CM_Unknown)
6837           continue;
6838 
6839         NumAccesses = Group->getNumMembers();
6840         if (interleavedAccessCanBeWidened(&I, VF))
6841           InterleaveCost = getInterleaveGroupCost(&I, VF);
6842       }
6843 
6844       unsigned GatherScatterCost =
6845           isLegalGatherOrScatter(&I)
6846               ? getGatherScatterCost(&I, VF) * NumAccesses
6847               : std::numeric_limits<unsigned>::max();
6848 
6849       unsigned ScalarizationCost =
6850           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6851 
      // Choose the better solution for the current VF; record this decision
      // and use it during vectorization.
6854       unsigned Cost;
6855       InstWidening Decision;
6856       if (InterleaveCost <= GatherScatterCost &&
6857           InterleaveCost < ScalarizationCost) {
6858         Decision = CM_Interleave;
6859         Cost = InterleaveCost;
6860       } else if (GatherScatterCost < ScalarizationCost) {
6861         Decision = CM_GatherScatter;
6862         Cost = GatherScatterCost;
6863       } else {
6864         Decision = CM_Scalarize;
6865         Cost = ScalarizationCost;
6866       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one member instruction.
6870       if (auto Group = getInterleavedAccessGroup(&I))
6871         setWideningDecision(Group, VF, Decision, Cost);
6872       else
6873         setWideningDecision(&I, VF, Decision, Cost);
6874     }
6875   }
6876 
6877   // Make sure that any load of address and any other address computation
6878   // remains scalar unless there is gather/scatter support. This avoids
6879   // inevitable extracts into address registers, and also has the benefit of
6880   // activating LSR more, since that pass can't optimize vectorized
6881   // addresses.
6882   if (TTI.prefersVectorizedAddressing())
6883     return;
6884 
6885   // Start with all scalar pointer uses.
6886   SmallPtrSet<Instruction *, 8> AddrDefs;
6887   for (BasicBlock *BB : TheLoop->blocks())
6888     for (Instruction &I : *BB) {
6889       Instruction *PtrDef =
6890         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6891       if (PtrDef && TheLoop->contains(PtrDef) &&
6892           getWideningDecision(&I, VF) != CM_GatherScatter)
6893         AddrDefs.insert(PtrDef);
6894     }
6895 
6896   // Add all instructions used to generate the addresses.
6897   SmallVector<Instruction *, 4> Worklist;
6898   for (auto *I : AddrDefs)
6899     Worklist.push_back(I);
6900   while (!Worklist.empty()) {
6901     Instruction *I = Worklist.pop_back_val();
6902     for (auto &Op : I->operands())
6903       if (auto *InstOp = dyn_cast<Instruction>(Op))
6904         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6905             AddrDefs.insert(InstOp).second)
6906           Worklist.push_back(InstOp);
6907   }
6908 
6909   for (auto *I : AddrDefs) {
6910     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by the
      // cost functions, but since this involves finding out whether the loaded
      // value is involved in an address computation, it is instead changed
      // here when we know this is the case.
6915       InstWidening Decision = getWideningDecision(I, VF);
6916       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6917         // Scalarize a widened load of address.
6918         setWideningDecision(
6919             I, VF, CM_Scalarize,
6920             (VF.getKnownMinValue() *
6921              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6922       else if (auto Group = getInterleavedAccessGroup(I)) {
6923         // Scalarize an interleave group of address loads.
6924         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6925           if (Instruction *Member = Group->getMember(I))
6926             setWideningDecision(
6927                 Member, VF, CM_Scalarize,
6928                 (VF.getKnownMinValue() *
6929                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6930         }
6931       }
6932     } else
6933       // Make sure I gets scalarized and a cost estimate without
6934       // scalarization overhead.
6935       ForcedScalars[VF].insert(I);
6936   }
6937 }
6938 
6939 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6940                                                         ElementCount VF,
6941                                                         Type *&VectorTy) {
6942   Type *RetTy = I->getType();
6943   if (canTruncateToMinimalBitwidth(I, VF))
6944     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6945   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6946   auto SE = PSE.getSE();
6947   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6948 
6949   // TODO: We need to estimate the cost of intrinsic calls.
6950   switch (I->getOpcode()) {
6951   case Instruction::GetElementPtr:
6952     // We mark this instruction as zero-cost because the cost of GEPs in
6953     // vectorized code depends on whether the corresponding memory instruction
6954     // is scalarized or not. Therefore, we handle GEPs with the memory
6955     // instruction cost.
6956     return 0;
6957   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6961     bool ScalarPredicatedBB = false;
6962     BranchInst *BI = cast<BranchInst>(I);
6963     if (VF.isVector() && BI->isConditional() &&
6964         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6965          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6966       ScalarPredicatedBB = true;
6967 
6968     if (ScalarPredicatedBB) {
6969       // Return cost for branches around scalarized and predicated blocks.
6970       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6971       auto *Vec_i1Ty =
6972           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6973       return (TTI.getScalarizationOverhead(
6974                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
6975                   false, true) +
6976               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
6977                VF.getKnownMinValue()));
6978     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6979       // The back-edge branch will remain, as will all scalar branches.
6980       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6981     else
6982       // This branch will be eliminated by if-conversion.
6983       return 0;
6984     // Note: We currently assume zero cost for an unconditional branch inside
6985     // a predicated block since it will become a fall-through, although we
6986     // may decide in the future to call TTI for all branches.
6987   }
6988   case Instruction::PHI: {
6989     auto *Phi = cast<PHINode>(I);
6990 
6991     // First-order recurrences are replaced by vector shuffles inside the loop.
6992     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6993     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
6994       return TTI.getShuffleCost(
6995           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
6996           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
6997 
6998     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6999     // converted into select instructions. We require N - 1 selects per phi
7000     // node, where N is the number of incoming values.
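    // For example, a phi merging values from three predecessors is lowered to
    // two chained selects on the corresponding edge masks.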
7001     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7002       return (Phi->getNumIncomingValues() - 1) *
7003              TTI.getCmpSelInstrCost(
7004                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7005                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7006                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7007 
7008     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7009   }
7010   case Instruction::UDiv:
7011   case Instruction::SDiv:
7012   case Instruction::URem:
7013   case Instruction::SRem:
7014     // If we have a predicated instruction, it may not be executed for each
7015     // vector lane. Get the scalarization cost and scale this amount by the
7016     // probability of executing the predicated block. If the instruction is not
7017     // predicated, we fall through to the next case.
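    // Sketch of the scaling (illustrative numbers): for VF = 4, a predicated
    // udiv costs roughly 4 * phi + 4 * udiv + the scalarization overhead,
    // all divided by the reciprocal block probability (assumed to be 2),
    // since each scalarized lane executes only when its predicate is true.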
7018     if (VF.isVector() && isScalarWithPredication(I)) {
7019       unsigned Cost = 0;
7020 
7021       // These instructions have a non-void type, so account for the phi nodes
7022       // that we will create. This cost is likely to be zero. The phi node
7023       // cost, if any, should be scaled by the block probability because it
7024       // models a copy at the end of each predicated block.
7025       Cost += VF.getKnownMinValue() *
7026               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7027 
7028       // The cost of the non-predicated instruction.
7029       Cost += VF.getKnownMinValue() *
7030               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7031 
7032       // The cost of insertelement and extractelement instructions needed for
7033       // scalarization.
7034       Cost += getScalarizationOverhead(I, VF);
7035 
7036       // Scale the cost by the probability of executing the predicated blocks.
7037       // This assumes the predicated block for each vector lane is equally
7038       // likely.
7039       return Cost / getReciprocalPredBlockProb();
7040     }
7041     LLVM_FALLTHROUGH;
7042   case Instruction::Add:
7043   case Instruction::FAdd:
7044   case Instruction::Sub:
7045   case Instruction::FSub:
7046   case Instruction::Mul:
7047   case Instruction::FMul:
7048   case Instruction::FDiv:
7049   case Instruction::FRem:
7050   case Instruction::Shl:
7051   case Instruction::LShr:
7052   case Instruction::AShr:
7053   case Instruction::And:
7054   case Instruction::Or:
7055   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
7057     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7058       return 0;
7059     // Certain instructions can be cheaper to vectorize if they have a constant
7060     // second vector operand. One example of this are shifts on x86.
7061     Value *Op2 = I->getOperand(1);
7062     TargetTransformInfo::OperandValueProperties Op2VP;
7063     TargetTransformInfo::OperandValueKind Op2VK =
7064         TTI.getOperandInfo(Op2, Op2VP);
7065     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7066       Op2VK = TargetTransformInfo::OK_UniformValue;
7067 
7068     SmallVector<const Value *, 4> Operands(I->operand_values());
7069     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7070     return N * TTI.getArithmeticInstrCost(
7071                    I->getOpcode(), VectorTy, CostKind,
7072                    TargetTransformInfo::OK_AnyValue,
7073                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7074   }
7075   case Instruction::FNeg: {
7076     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7077     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7078     return N * TTI.getArithmeticInstrCost(
7079                    I->getOpcode(), VectorTy, CostKind,
7080                    TargetTransformInfo::OK_AnyValue,
7081                    TargetTransformInfo::OK_AnyValue,
7082                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
7083                    I->getOperand(0), I);
7084   }
7085   case Instruction::Select: {
7086     SelectInst *SI = cast<SelectInst>(I);
7087     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7088     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7089     Type *CondTy = SI->getCondition()->getType();
7090     if (!ScalarCond) {
7091       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7092       CondTy = VectorType::get(CondTy, VF);
7093     }
7094     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7095                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7096   }
7097   case Instruction::ICmp:
7098   case Instruction::FCmp: {
7099     Type *ValTy = I->getOperand(0)->getType();
7100     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7101     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7102       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7103     VectorTy = ToVectorTy(ValTy, VF);
7104     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7105                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7106   }
7107   case Instruction::Store:
7108   case Instruction::Load: {
7109     ElementCount Width = VF;
7110     if (Width.isVector()) {
7111       InstWidening Decision = getWideningDecision(I, Width);
7112       assert(Decision != CM_Unknown &&
7113              "CM decision should be taken at this point");
7114       if (Decision == CM_Scalarize)
7115         Width = ElementCount::getFixed(1);
7116     }
7117     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
7118     return getMemoryInstructionCost(I, VF);
7119   }
7120   case Instruction::ZExt:
7121   case Instruction::SExt:
7122   case Instruction::FPToUI:
7123   case Instruction::FPToSI:
7124   case Instruction::FPExt:
7125   case Instruction::PtrToInt:
7126   case Instruction::IntToPtr:
7127   case Instruction::SIToFP:
7128   case Instruction::UIToFP:
7129   case Instruction::Trunc:
7130   case Instruction::FPTrunc:
7131   case Instruction::BitCast: {
7132     // Computes the CastContextHint from a Load/Store instruction.
7133     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7134       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7135              "Expected a load or a store!");
7136 
7137       if (VF.isScalar() || !TheLoop->contains(I))
7138         return TTI::CastContextHint::Normal;
7139 
7140       switch (getWideningDecision(I, VF)) {
7141       case LoopVectorizationCostModel::CM_GatherScatter:
7142         return TTI::CastContextHint::GatherScatter;
7143       case LoopVectorizationCostModel::CM_Interleave:
7144         return TTI::CastContextHint::Interleave;
7145       case LoopVectorizationCostModel::CM_Scalarize:
7146       case LoopVectorizationCostModel::CM_Widen:
7147         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7148                                         : TTI::CastContextHint::Normal;
7149       case LoopVectorizationCostModel::CM_Widen_Reverse:
7150         return TTI::CastContextHint::Reversed;
7151       case LoopVectorizationCostModel::CM_Unknown:
7152         llvm_unreachable("Instr did not go through cost modelling?");
7153       }
7154 
7155       llvm_unreachable("Unhandled case!");
7156     };
7157 
7158     unsigned Opcode = I->getOpcode();
7159     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7160     // For Trunc, the context is the only user, which must be a StoreInst.
7161     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7162       if (I->hasOneUse())
7163         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7164           CCH = ComputeCCH(Store);
7165     }
7166     // For Z/Sext, the context is the operand, which must be a LoadInst.
7167     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7168              Opcode == Instruction::FPExt) {
7169       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7170         CCH = ComputeCCH(Load);
7171     }
7172 
7173     // We optimize the truncation of induction variables having constant
7174     // integer steps. The cost of these truncations is the same as the scalar
7175     // operation.
7176     if (isOptimizableIVTruncate(I, VF)) {
7177       auto *Trunc = cast<TruncInst>(I);
7178       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7179                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7180     }
7181 
7182     Type *SrcScalarTy = I->getOperand(0)->getType();
7183     Type *SrcVecTy =
7184         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7185     if (canTruncateToMinimalBitwidth(I, VF)) {
7186       // This cast is going to be shrunk. This may remove the cast or it might
7187       // turn it into slightly different cast. For example, if MinBW == 16,
7188       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7189       //
7190       // Calculate the modified src and dest types.
7191       Type *MinVecTy = VectorTy;
7192       if (Opcode == Instruction::Trunc) {
7193         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7194         VectorTy =
7195             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7196       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7197         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7198         VectorTy =
7199             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7200       }
7201     }
7202 
7203     assert(!VF.isScalable() && "VF is assumed to be non scalable");
7204     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7205     return N *
7206            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7207   }
7208   case Instruction::Call: {
7209     bool NeedToScalarize;
7210     CallInst *CI = cast<CallInst>(I);
7211     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7212     if (getVectorIntrinsicIDForCall(CI, TLI))
7213       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
7214     return CallCost;
7215   }
7216   case Instruction::ExtractValue: {
7217     InstructionCost ExtractCost =
7218         TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7219     assert(ExtractCost.isValid() && "Invalid cost for ExtractValue");
7220     return *(ExtractCost.getValue());
7221   }
7222   default:
7223     // The cost of executing VF copies of the scalar instruction. This opcode
7224     // is unknown. Assume that it is the same as 'mul'.
7225     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
7226                                        Instruction::Mul, VectorTy, CostKind) +
7227            getScalarizationOverhead(I, VF);
7228   } // end of switch.
7229 }
7230 
7231 char LoopVectorize::ID = 0;
7232 
7233 static const char lv_name[] = "Loop Vectorization";
7234 
7235 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7236 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7237 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7238 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7239 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7240 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7241 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7242 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7243 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7244 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7245 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7246 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7247 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7248 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7249 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7250 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7251 
7252 namespace llvm {
7253 
7254 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7255 
7256 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7257                               bool VectorizeOnlyWhenForced) {
7258   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7259 }
7260 
7261 } // end namespace llvm
7262 
7263 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7264   // Check if the pointer operand of a load or store instruction is
7265   // consecutive.
7266   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7267     return Legal->isConsecutivePtr(Ptr);
7268   return false;
7269 }
7270 
7271 void LoopVectorizationCostModel::collectValuesToIgnore() {
7272   // Ignore ephemeral values.
7273   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7274 
7275   // Ignore type-promoting instructions we identified during reduction
7276   // detection.
7277   for (auto &Reduction : Legal->getReductionVars()) {
7278     RecurrenceDescriptor &RedDes = Reduction.second;
7279     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7280     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7281   }
7282   // Ignore type-casting instructions we identified during induction
7283   // detection.
7284   for (auto &Induction : Legal->getInductionVars()) {
7285     InductionDescriptor &IndDes = Induction.second;
7286     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7287     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7288   }
7289 }
7290 
7291 void LoopVectorizationCostModel::collectInLoopReductions() {
7292   for (auto &Reduction : Legal->getReductionVars()) {
7293     PHINode *Phi = Reduction.first;
7294     RecurrenceDescriptor &RdxDesc = Reduction.second;
7295 
7296     // We don't collect reductions that are type promoted (yet).
7297     if (RdxDesc.getRecurrenceType() != Phi->getType())
7298       continue;
7299 
7300     // If the target would prefer this reduction to happen "in-loop", then we
7301     // want to record it as such.
7302     unsigned Opcode = RdxDesc.getRecurrenceBinOp();
7303     if (!PreferInLoopReductions &&
7304         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7305                                    TargetTransformInfo::ReductionFlags()))
7306       continue;
7307 
7308     // Check that we can correctly put the reductions into the loop, by
7309     // finding the chain of operations that leads from the phi to the loop
7310     // exit value.
7311     SmallVector<Instruction *, 4> ReductionOperations =
7312         RdxDesc.getReductionOpChain(Phi, TheLoop);
7313     bool InLoop = !ReductionOperations.empty();
7314     if (InLoop)
7315       InLoopReductionChains[Phi] = ReductionOperations;
7316     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7317                       << " reduction for phi: " << *Phi << "\n");
7318   }
7319 }
7320 
7321 // TODO: we could return a pair of values that specify the max VF and
7322 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7323 // `buildVPlans(VF, VF)`. We cannot do it yet because VPlan does not
7324 // have a cost model that can choose which plan to execute if more
7325 // than one is generated.
7326 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7327                                  LoopVectorizationCostModel &CM) {
7328   unsigned WidestType;
7329   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
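  // E.g., with 512-bit vector registers and a widest scalar type of i32,
  // this yields a VPlan VF of 512 / 32 = 16.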
7330   return WidestVectorRegBits / WidestType;
7331 }
7332 
7333 VectorizationFactor
7334 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7335   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7336   ElementCount VF = UserVF;
7337   // Outer loop handling: outer loops may require CFG and instruction-level
7338   // transformations before even evaluating whether vectorization is profitable.
7339   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7340   // the vectorization pipeline.
7341   if (!OrigLoop->isInnermost()) {
7342     // If the user doesn't provide a vectorization factor, determine a
7343     // reasonable one.
7344     if (UserVF.isZero()) {
7345       VF = ElementCount::getFixed(
7346           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7347       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7348 
7349       // Make sure we have a VF > 1 for stress testing.
7350       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7351         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7352                           << "overriding computed VF.\n");
7353         VF = ElementCount::getFixed(4);
7354       }
7355     }
7356     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7357     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7358            "VF needs to be a power of two");
7359     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7360                       << "VF " << VF << " to build VPlans.\n");
7361     buildVPlans(VF, VF);
7362 
7363     // For VPlan build stress testing, we bail out after VPlan construction.
7364     if (VPlanBuildStressTest)
7365       return VectorizationFactor::Disabled();
7366 
7367     return {VF, 0 /*Cost*/};
7368   }
7369 
7370   LLVM_DEBUG(
7371       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7372                 "VPlan-native path.\n");
7373   return VectorizationFactor::Disabled();
7374 }
7375 
7376 Optional<VectorizationFactor>
7377 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7378   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7379   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
7380   if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
7381     return None;
7382 
7383   // Invalidate interleave groups if all blocks of loop will be predicated.
7384   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
7385       !useMaskedInterleavedAccesses(*TTI)) {
7386     LLVM_DEBUG(
7387         dbgs()
7388         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7389            "which requires masked-interleaved support.\n");
7390     if (CM.InterleaveInfo.invalidateGroups())
7391       // Invalidating interleave groups also requires invalidating all decisions
7392       // based on them, which includes widening decisions and uniform and scalar
7393       // values.
7394       CM.invalidateCostModelingDecisions();
7395   }
7396 
7397   ElementCount MaxVF = MaybeMaxVF.getValue();
7398   assert(MaxVF.isNonZero() && "MaxVF is zero.");
7399 
7400   if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) {
7401     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7402     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7403            "VF needs to be a power of two");
7404     // Collect the instructions (and their associated costs) that will be more
7405     // profitable to scalarize.
7406     CM.selectUserVectorizationFactor(UserVF);
7407     CM.collectInLoopReductions();
7408     buildVPlansWithVPRecipes(UserVF, UserVF);
7409     LLVM_DEBUG(printPlans(dbgs()));
7410     return {{UserVF, 0}};
7411   }
7412 
7413   assert(!MaxVF.isScalable() &&
7414          "Scalable vectors not yet supported beyond this point");
7415 
7416   for (ElementCount VF = ElementCount::getFixed(1);
7417        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
7418     // Collect Uniform and Scalar instructions after vectorization with VF.
7419     CM.collectUniformsAndScalars(VF);
7420 
7421     // Collect the instructions (and their associated costs) that will be more
7422     // profitable to scalarize.
7423     if (VF.isVector())
7424       CM.collectInstsToScalarize(VF);
7425   }
7426 
7427   CM.collectInLoopReductions();
7428 
7429   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
7430   LLVM_DEBUG(printPlans(dbgs()));
7431   if (MaxVF.isScalar())
7432     return VectorizationFactor::Disabled();
7433 
7434   // Select the optimal vectorization factor.
7435   return CM.selectVectorizationFactor(MaxVF);
7436 }
7437 
7438 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7439   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7440                     << '\n');
7441   BestVF = VF;
7442   BestUF = UF;
7443 
7444   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7445     return !Plan->hasVF(VF);
7446   });
7447   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7448 }
7449 
7450 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7451                                            DominatorTree *DT) {
7452   // Perform the actual loop transformation.
7453 
7454   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7455   VPCallbackILV CallbackILV(ILV);
7456 
7457   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7458 
7459   VPTransformState State{*BestVF, BestUF,      LI,
7460                          DT,      ILV.Builder, ILV.VectorLoopValueMap,
7461                          &ILV,    CallbackILV};
7462   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7463   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7464   State.CanonicalIV = ILV.Induction;
7465 
7466   ILV.printDebugTracesAtStart();
7467 
7468   //===------------------------------------------------===//
7469   //
7470   // Notice: any optimization or new instruction that goes
7471   // into the code below should also be implemented in
7472   // the cost-model.
7473   //
7474   //===------------------------------------------------===//
7475 
7476   // 2. Copy and widen instructions from the old loop into the new loop.
7477   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7478   VPlans.front()->execute(&State);
7479 
7480   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7481   //    predication, updating analyses.
7482   ILV.fixVectorizedLoop();
7483 
7484   ILV.printDebugTracesAtEnd();
7485 }
7486 
7487 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7488     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7489 
7490   // We create new control-flow for the vectorized loop, so the original exit
7491   // conditions will be dead after vectorization if each is only used by its
7492   // block's terminator.
7493   SmallVector<BasicBlock*> ExitingBlocks;
7494   OrigLoop->getExitingBlocks(ExitingBlocks);
7495   for (auto *BB : ExitingBlocks) {
7496     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7497     if (!Cmp || !Cmp->hasOneUse())
7498       continue;
7499 
7500     // TODO: we should introduce a getUniqueExitingBlocks on Loop
7501     if (!DeadInstructions.insert(Cmp).second)
7502       continue;
7503 
7504     // The operands of the icmp are often a dead trunc, used by IndUpdate.
7505     // TODO: we could recurse through operands in general.
7506     for (Value *Op : Cmp->operands()) {
7507       if (isa<TruncInst>(Op) && Op->hasOneUse())
7508         DeadInstructions.insert(cast<Instruction>(Op));
7509     }
7510   }
7511 
7512   // We create new "steps" for induction variable updates to which the original
7513   // induction variables map. An original update instruction will be dead if
7514   // all its users except the induction variable are dead.
7515   auto *Latch = OrigLoop->getLoopLatch();
7516   for (auto &Induction : Legal->getInductionVars()) {
7517     PHINode *Ind = Induction.first;
7518     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7519 
7520     // If the tail is to be folded by masking, the primary induction variable,
7521     // if it exists, isn't dead: it will be used for masking. Don't kill it.
7522     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7523       continue;
7524 
7525     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7526           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7527         }))
7528       DeadInstructions.insert(IndUpdate);
7529 
7530     // We also record as "Dead" the type-casting instructions we had identified
7531     // during induction analysis. We don't need any handling for them in the
7532     // vectorized loop because we have proven that, under a proper runtime
7533     // test guarding the vectorized loop, the value of the phi, and the casted
7534     // value of the phi, are the same. The last instruction in this casting chain
7535     // will get its scalar/vector/widened def from the scalar/vector/widened def
7536     // of the respective phi node. Any other casts in the induction def-use chain
7537     // have no other uses outside the phi update chain, and will be ignored.
7538     InductionDescriptor &IndDes = Induction.second;
7539     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7540     DeadInstructions.insert(Casts.begin(), Casts.end());
7541   }
7542 }
7543 
7544 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7545 
7546 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7547 
7548 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7549                                         Instruction::BinaryOps BinOp) {
7550   // When unrolling and the VF is 1, we only need to add a simple scalar.
7551   Type *Ty = Val->getType();
7552   assert(!Ty->isVectorTy() && "Val must be a scalar");
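  // The result is Val + StartIdx * Step for integers, or
  // Val BinOp (StartIdx * Step) for floating point.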
7553 
7554   if (Ty->isFloatingPointTy()) {
7555     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7556 
7557     // Floating point operations had to be 'fast' to enable the unrolling.
7558     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7559     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7560   }
7561   Constant *C = ConstantInt::get(Ty, StartIdx);
7562   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7563 }
7564 
7565 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7566   SmallVector<Metadata *, 4> MDs;
7567   // Reserve first location for self reference to the LoopID metadata node.
7568   MDs.push_back(nullptr);
7569   bool IsUnrollMetadata = false;
7570   MDNode *LoopID = L->getLoopID();
7571   if (LoopID) {
7572     // First find existing loop unrolling disable metadata.
7573     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7574       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7575       if (MD) {
7576         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7577         IsUnrollMetadata =
7578             S && S->getString().startswith("llvm.loop.unroll.disable");
7579       }
7580       MDs.push_back(LoopID->getOperand(i));
7581     }
7582   }
7583 
7584   if (!IsUnrollMetadata) {
7585     // Add runtime unroll disable metadata.
7586     LLVMContext &Context = L->getHeader()->getContext();
7587     SmallVector<Metadata *, 1> DisableOperands;
7588     DisableOperands.push_back(
7589         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7590     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7591     MDs.push_back(DisableNode);
7592     MDNode *NewLoopID = MDNode::get(Context, MDs);
7593     // Set operand 0 to refer to the loop id itself.
7594     NewLoopID->replaceOperandWith(0, NewLoopID);
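    // The loop metadata now has the form:
    //   !{!<self>, <existing operands...>,
    //     !{!"llvm.loop.unroll.runtime.disable"}}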
7595     L->setLoopID(NewLoopID);
7596   }
7597 }
7598 
7599 //===--------------------------------------------------------------------===//
7600 // EpilogueVectorizerMainLoop
7601 //===--------------------------------------------------------------------===//
7602 
7603 /// This function is partially responsible for generating the control flow
7604 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7605 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7606   MDNode *OrigLoopID = OrigLoop->getLoopID();
7607   Loop *Lp = createVectorLoopSkeleton("");
7608 
7609   // Generate the code to check the minimum iteration count of the vector
7610   // epilogue (see below).
7611   EPI.EpilogueIterationCountCheck =
7612       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
7613   EPI.EpilogueIterationCountCheck->setName("iter.check");
7614 
7615   // Generate the code to check any assumptions that we've made for SCEV
7616   // expressions.
7617   BasicBlock *SavedPreHeader = LoopVectorPreHeader;
7618   emitSCEVChecks(Lp, LoopScalarPreHeader);
7619 
7620   // If a safety check was generated, save it.
7621   if (SavedPreHeader != LoopVectorPreHeader)
7622     EPI.SCEVSafetyCheck = SavedPreHeader;
7623 
7624   // Generate the code that checks at runtime if arrays overlap. We put the
7625   // checks into a separate block to make the more common case of few elements
7626   // faster.
7627   SavedPreHeader = LoopVectorPreHeader;
7628   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
7629 
7630   // If a safety check was generated, save/overwrite it.
7631   if (SavedPreHeader != LoopVectorPreHeader)
7632     EPI.MemSafetyCheck = SavedPreHeader;
7633 
7634   // Generate the iteration count check for the main loop, *after* the check
7635   // for the epilogue loop, so that the path-length is shorter for the case
7636   // that goes directly through the vector epilogue. The longer-path length for
7637   // the main loop is compensated for by the gain from vectorizing the larger
7638   // trip count. Note: the branch will get updated later on when we vectorize
7639   // the epilogue.
7640   EPI.MainLoopIterationCountCheck =
7641       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
7642 
7643   // Generate the induction variable.
7644   OldInduction = Legal->getPrimaryInduction();
7645   Type *IdxTy = Legal->getWidestInductionType();
7646   Value *StartIdx = ConstantInt::get(IdxTy, 0);
7647   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7648   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7649   EPI.VectorTripCount = CountRoundDown;
7650   Induction =
7651       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7652                               getDebugLocFromInstOrOperands(OldInduction));
7653 
7654   // Skip creating induction resume values here; they will be created in the
7655   // second pass. If we created them here, they wouldn't be used anyway,
7656   // because the VPlan in the second pass still contains the inductions from
7657   // the original loop.
7658 
7659   return completeLoopSkeleton(Lp, OrigLoopID);
7660 }
7661 
7662 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7663   LLVM_DEBUG({
7664     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7665            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7666            << ", Main Loop UF:" << EPI.MainLoopUF
7667            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7668            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7669   });
7670 }
7671 
7672 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7673   DEBUG_WITH_TYPE(VerboseDebug, {
7674     dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
7675   });
7676 }
7677 
7678 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
7679     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
7680   assert(L && "Expected valid Loop.");
7681   assert(Bypass && "Expected valid bypass basic block.");
7682   unsigned VFactor =
7683       ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
7684   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7685   Value *Count = getOrCreateTripCount(L);
7686   // Reuse existing vector loop preheader for TC checks.
7687   // Note that new preheader block is generated for vector loop.
7688   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7689   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7690 
7691   // Generate code to check if the loop's trip count is less than VF * UF of the
7692   // main vector loop.
7693   auto P =
7694       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7695 
7696   Value *CheckMinIters = Builder.CreateICmp(
7697       P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
7698       "min.iters.check");
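  // E.g., with VFactor = 8, UFactor = 2 and no required scalar epilogue, this
  // emits: %min.iters.check = icmp ult <ty> %count, 16.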
7699 
7700   if (!ForEpilogue)
7701     TCCheckBlock->setName("vector.main.loop.iter.check");
7702 
7703   // Create new preheader for vector loop.
7704   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7705                                    DT, LI, nullptr, "vector.ph");
7706 
7707   if (ForEpilogue) {
7708     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7709                                  DT->getNode(Bypass)->getIDom()) &&
7710            "TC check is expected to dominate Bypass");
7711 
7712     // Update dominator for Bypass & LoopExit.
7713     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7714     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7715 
7716     LoopBypassBlocks.push_back(TCCheckBlock);
7717 
7718     // Save the trip count so we don't have to regenerate it in the
7719     // vec.epilog.iter.check. This is safe to do because the trip count
7720     // generated here dominates the vector epilog iter check.
7721     EPI.TripCount = Count;
7722   }
7723 
7724   ReplaceInstWithInst(
7725       TCCheckBlock->getTerminator(),
7726       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7727 
7728   return TCCheckBlock;
7729 }
7730 
7731 //===--------------------------------------------------------------------===//
7732 // EpilogueVectorizerEpilogueLoop
7733 //===--------------------------------------------------------------------===//
7734 
7735 /// This function is partially responsible for generating the control flow
7736 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7737 BasicBlock *
7738 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7739   MDNode *OrigLoopID = OrigLoop->getLoopID();
7740   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
7741 
7742   // Now, compare the remaining count and, if there aren't enough iterations to
7743   // execute the vectorized epilogue, skip to the scalar part.
7744   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7745   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7746   LoopVectorPreHeader =
7747       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7748                  LI, nullptr, "vec.epilog.ph");
7749   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
7750                                           VecEpilogueIterationCountCheck);
7751 
7752   // Adjust the control flow taking the state info from the main loop
7753   // vectorization into account.
7754   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7755          "expected this to be saved from the previous pass.");
7756   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7757       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7758 
7759   DT->changeImmediateDominator(LoopVectorPreHeader,
7760                                EPI.MainLoopIterationCountCheck);
7761 
7762   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7763       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7764 
7765   if (EPI.SCEVSafetyCheck)
7766     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7767         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7768   if (EPI.MemSafetyCheck)
7769     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7770         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7771 
7772   DT->changeImmediateDominator(
7773       VecEpilogueIterationCountCheck,
7774       VecEpilogueIterationCountCheck->getSinglePredecessor());
7775 
7776   DT->changeImmediateDominator(LoopScalarPreHeader,
7777                                EPI.EpilogueIterationCountCheck);
7778   DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
7779 
7780   // Keep track of bypass blocks, as they feed start values to the induction
7781   // phis in the scalar loop preheader.
7782   if (EPI.SCEVSafetyCheck)
7783     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7784   if (EPI.MemSafetyCheck)
7785     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7786   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7787 
7788   // Generate a resume induction for the vector epilogue and put it in the
7789   // vector epilogue preheader
7790   Type *IdxTy = Legal->getWidestInductionType();
7791   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7792                                          LoopVectorPreHeader->getFirstNonPHI());
7793   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7794   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7795                            EPI.MainLoopIterationCountCheck);
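  // EPResumeVal is the iteration at which the epilogue vector loop starts:
  // the main vector loop's trip count if the main loop ran, or 0 if it was
  // bypassed by its iteration count check.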
7796 
7797   // Generate the induction variable.
7798   OldInduction = Legal->getPrimaryInduction();
7799   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7800   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7801   Value *StartIdx = EPResumeVal;
7802   Induction =
7803       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7804                               getDebugLocFromInstOrOperands(OldInduction));
7805 
7806   // Generate induction resume values. These variables save the new starting
7807   // indexes for the scalar loop. They are used to test if there are any tail
7808   // iterations left once the vector loop has completed.
7809   // Note that when the vectorized epilogue is skipped due to iteration count
7810   // check, then the resume value for the induction variable comes from
7811   // the trip count of the main vector loop, hence passing the AdditionalBypass
7812   // argument.
7813   createInductionResumeValues(Lp, CountRoundDown,
7814                               {VecEpilogueIterationCountCheck,
7815                                EPI.VectorTripCount} /* AdditionalBypass */);
7816 
7817   AddRuntimeUnrollDisableMetaData(Lp);
7818   return completeLoopSkeleton(Lp, OrigLoopID);
7819 }
7820 
7821 BasicBlock *
7822 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7823     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
7824 
7825   assert(EPI.TripCount &&
7826          "Expected trip count to have been saved in the first pass.");
7827   assert(
7828       (!isa<Instruction>(EPI.TripCount) ||
7829        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7830       "saved trip count does not dominate insertion point.");
7831   Value *TC = EPI.TripCount;
7832   IRBuilder<> Builder(Insert->getTerminator());
7833   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
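  // Count is the number of iterations left over after the main vector loop,
  // i.e. the original trip count minus the main loop's vector trip count.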
7834 
7835   // Generate code to check if the loop's trip count is less than VF * UF of the
7836   // vector epilogue loop.
7837   auto P =
7838       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7839 
7840   Value *CheckMinIters = Builder.CreateICmp(
7841       P, Count,
7842       ConstantInt::get(Count->getType(),
7843                        EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
7844       "min.epilog.iters.check");
7845 
7846   ReplaceInstWithInst(
7847       Insert->getTerminator(),
7848       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7849 
7850   LoopBypassBlocks.push_back(Insert);
7851   return Insert;
7852 }
7853 
7854 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7855   LLVM_DEBUG({
7856     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7857            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7858            << ", Main Loop UF:" << EPI.MainLoopUF
7859            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7860            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7861   });
7862 }
7863 
7864 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7865   DEBUG_WITH_TYPE(VerboseDebug, {
7866     dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
7867   });
7868 }
7869 
7870 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7871     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7872   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7873   bool PredicateAtRangeStart = Predicate(Range.Start);
7874 
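  // Example: for Range = [2, 16), if the predicate holds for VF=2 and VF=4
  // but not for VF=8, the range is clamped to [2, 8) and the value of the
  // predicate at the range start (true) is returned.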
7875   for (ElementCount TmpVF = Range.Start * 2;
7876        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7877     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7878       Range.End = TmpVF;
7879       break;
7880     }
7881 
7882   return PredicateAtRangeStart;
7883 }
7884 
7885 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7886 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7887 /// of VF's starting at a given VF and extending it as much as possible. Each
7888 /// vectorization decision can potentially shorten this sub-range during
7889 /// buildVPlan().
7890 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7891                                            ElementCount MaxVF) {
7892   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
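  // E.g., with MinVF=2 and MaxVF=16, the first sub-range starts as [2, 17)
  // and may be shortened while building its VPlan; the next sub-range then
  // starts at the resulting End, and so on until MaxVF is covered.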
7893   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7894     VFRange SubRange = {VF, MaxVFPlusOne};
7895     VPlans.push_back(buildVPlan(SubRange));
7896     VF = SubRange.End;
7897   }
7898 }
7899 
7900 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7901                                          VPlanPtr &Plan) {
7902   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7903 
7904   // Look for cached value.
7905   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7906   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7907   if (ECEntryIt != EdgeMaskCache.end())
7908     return ECEntryIt->second;
7909 
7910   VPValue *SrcMask = createBlockInMask(Src, Plan);
7911 
7912   // The terminator has to be a branch inst!
7913   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7914   assert(BI && "Unexpected terminator found");
7915 
7916   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7917     return EdgeMaskCache[Edge] = SrcMask;
7918 
7919   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
7920   assert(EdgeMask && "No Edge Mask found for condition");
7921 
7922   if (BI->getSuccessor(0) != Dst)
7923     EdgeMask = Builder.createNot(EdgeMask);
7924 
7925   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7926     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7927 
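  // At this point EdgeMask is the branch condition (negated for the false
  // successor), ANDed with the source block's mask unless that mask is
  // all-one.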
7928   return EdgeMaskCache[Edge] = EdgeMask;
7929 }
7930 
7931 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7932   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7933 
7934   // Look for cached value.
7935   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7936   if (BCEntryIt != BlockMaskCache.end())
7937     return BCEntryIt->second;
7938 
7939   // All-one mask is modelled as no-mask following the convention for masked
7940   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7941   VPValue *BlockMask = nullptr;
7942 
7943   if (OrigLoop->getHeader() == BB) {
7944     if (!CM.blockNeedsPredication(BB))
7945       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7946 
7947     // Create the block in mask as the first non-phi instruction in the block.
7948     VPBuilder::InsertPointGuard Guard(Builder);
7949     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
7950     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
7951 
7952     // Introduce the early-exit compare IV <= BTC to form header block mask.
7953     // This is used instead of IV < TC because TC may wrap, unlike BTC.
7954     // Start by constructing the desired canonical IV.
7955     VPValue *IV = nullptr;
7956     if (Legal->getPrimaryInduction())
7957       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
7958     else {
7959       auto IVRecipe = new VPWidenCanonicalIVRecipe();
7960       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
7961       IV = IVRecipe->getVPValue();
7962     }
7963     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7964     bool TailFolded = !CM.isScalarEpilogueAllowed();
7965 
7966     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
7967       // While ActiveLaneMask is a binary op that consumes the loop tripcount
7968       // as a second argument, we only pass the IV here and extract the
7969       // tripcount from the transform state, where codegen of the VP
7970       // instructions happens.
7971       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
7972     } else {
7973       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
7974     }
7975     return BlockMaskCache[BB] = BlockMask;
7976   }
7977 
7978   // This is the block mask. We OR all incoming edges.
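  // E.g., for a block with predecessors P1 and P2 the result is
  // EdgeMask(P1->BB) | EdgeMask(P2->BB), unless one of the edge masks is
  // all-one, in which case the block mask is all-one as well.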
7979   for (auto *Predecessor : predecessors(BB)) {
7980     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7981     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7982       return BlockMaskCache[BB] = EdgeMask;
7983 
7984     if (!BlockMask) { // BlockMask has its initialized nullptr value.
7985       BlockMask = EdgeMask;
7986       continue;
7987     }
7988 
7989     BlockMask = Builder.createOr(BlockMask, EdgeMask);
7990   }
7991 
7992   return BlockMaskCache[BB] = BlockMask;
7993 }
7994 
7995 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7996                                                 VPlanPtr &Plan) {
7997   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7998          "Must be called with either a load or store");
7999 
8000   auto willWiden = [&](ElementCount VF) -> bool {
8001     if (VF.isScalar())
8002       return false;
8003     LoopVectorizationCostModel::InstWidening Decision =
8004         CM.getWideningDecision(I, VF);
8005     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8006            "CM decision should be taken at this point.");
8007     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8008       return true;
8009     if (CM.isScalarAfterVectorization(I, VF) ||
8010         CM.isProfitableToScalarize(I, VF))
8011       return false;
8012     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8013   };
8014 
8015   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8016     return nullptr;
8017 
8018   VPValue *Mask = nullptr;
8019   if (Legal->isMaskRequired(I))
8020     Mask = createBlockInMask(I->getParent(), Plan);
8021 
8022   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
8023   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8024     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
8025 
8026   StoreInst *Store = cast<StoreInst>(I);
8027   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
8028   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
8029 }
8030 
8031 VPWidenIntOrFpInductionRecipe *
8032 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
8033   // Check if this is an integer or fp induction. If so, build the recipe that
8034   // produces its scalar and vector values.
8035   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8036   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8037       II.getKind() == InductionDescriptor::IK_FpInduction)
8038     return new VPWidenIntOrFpInductionRecipe(Phi);
8039 
8040   return nullptr;
8041 }
8042 
8043 VPWidenIntOrFpInductionRecipe *
8044 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
8045                                                 VFRange &Range) const {
8046   // Optimize the special case where the source is a constant integer
8047   // induction variable. Notice that we can only optimize the 'trunc' case
8048   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8049   // (c) other casts depend on pointer size.
8050 
8051   // Determine whether \p K is a truncation based on an induction variable that
8052   // can be optimized.
8053   auto isOptimizableIVTruncate =
8054       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8055     return [=](ElementCount VF) -> bool {
8056       return CM.isOptimizableIVTruncate(K, VF);
8057     };
8058   };
8059 
8060   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8061           isOptimizableIVTruncate(I), Range))
8062     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8063                                              I);
8064   return nullptr;
8065 }
8066 
8067 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
8068   // We know that all PHIs in non-header blocks are converted into selects, so
8069   // we don't have to worry about the insertion order and we can just use the
8070   // builder. At this point we generate the predication tree. There may be
8071   // duplications since this is a simple recursive scan, but future
8072   // optimizations will clean it up.
8073 
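  // The operands of the resulting blend are laid out as consecutive
  // (incoming value, edge mask) pairs; the mask is omitted when it is
  // all-one, which can only happen for a single predecessor.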
8074   SmallVector<VPValue *, 2> Operands;
8075   unsigned NumIncoming = Phi->getNumIncomingValues();
8076   for (unsigned In = 0; In < NumIncoming; In++) {
8077     VPValue *EdgeMask =
8078       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8079     assert((EdgeMask || NumIncoming == 1) &&
8080            "Multiple predecessors with one having a full mask");
8081     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
8082     if (EdgeMask)
8083       Operands.push_back(EdgeMask);
8084   }
8085   return new VPBlendRecipe(Phi, Operands);
8086 }
8087 
8088 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
8089                                                    VPlan &Plan) const {
8090 
8091   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8092       [this, CI](ElementCount VF) {
8093         return CM.isScalarWithPredication(CI, VF);
8094       },
8095       Range);
8096 
8097   if (IsPredicated)
8098     return nullptr;
8099 
8100   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8101   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8102              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8103              ID == Intrinsic::pseudoprobe))
8104     return nullptr;
8105 
8106   auto willWiden = [&](ElementCount VF) -> bool {
8107     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8108     // The following case may be scalarized depending on the VF.
8109     // The flag shows whether we use an intrinsic or a plain call for the
8110     // vectorized version of the instruction: is it beneficial to perform the
8111     // intrinsic call compared to the lib call?
8112     bool NeedToScalarize = false;
8113     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8114     bool UseVectorIntrinsic =
8115         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
8116     return UseVectorIntrinsic || !NeedToScalarize;
8117   };
8118 
8119   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8120     return nullptr;
8121 
8122   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
8123 }
8124 
8125 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8126   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8127          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8128   // The instruction should be widened, unless it is scalar after
8129   // vectorization, scalarization is profitable, or it is predicated.
8130   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8131     return CM.isScalarAfterVectorization(I, VF) ||
8132            CM.isProfitableToScalarize(I, VF) ||
8133            CM.isScalarWithPredication(I, VF);
8134   };
8135   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8136                                                              Range);
8137 }
8138 
8139 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
8140   auto IsVectorizableOpcode = [](unsigned Opcode) {
8141     switch (Opcode) {
8142     case Instruction::Add:
8143     case Instruction::And:
8144     case Instruction::AShr:
8145     case Instruction::BitCast:
8146     case Instruction::FAdd:
8147     case Instruction::FCmp:
8148     case Instruction::FDiv:
8149     case Instruction::FMul:
8150     case Instruction::FNeg:
8151     case Instruction::FPExt:
8152     case Instruction::FPToSI:
8153     case Instruction::FPToUI:
8154     case Instruction::FPTrunc:
8155     case Instruction::FRem:
8156     case Instruction::FSub:
8157     case Instruction::ICmp:
8158     case Instruction::IntToPtr:
8159     case Instruction::LShr:
8160     case Instruction::Mul:
8161     case Instruction::Or:
8162     case Instruction::PtrToInt:
8163     case Instruction::SDiv:
8164     case Instruction::Select:
8165     case Instruction::SExt:
8166     case Instruction::Shl:
8167     case Instruction::SIToFP:
8168     case Instruction::SRem:
8169     case Instruction::Sub:
8170     case Instruction::Trunc:
8171     case Instruction::UDiv:
8172     case Instruction::UIToFP:
8173     case Instruction::URem:
8174     case Instruction::Xor:
8175     case Instruction::ZExt:
8176       return true;
8177     }
8178     return false;
8179   };
8180 
8181   if (!IsVectorizableOpcode(I->getOpcode()))
8182     return nullptr;
8183 
8184   // Success: widen this instruction.
8185   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
8186 }
8187 
8188 VPBasicBlock *VPRecipeBuilder::handleReplication(
8189     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8190     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
8191     VPlanPtr &Plan) {
8192   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8193       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8194       Range);
8195 
8196   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8197       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
8198       Range);
8199 
8200   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8201                                        IsUniform, IsPredicated);
8202   setRecipe(I, Recipe);
8203   Plan->addVPValue(I, Recipe);
8204 
8205   // Find if I uses a predicated instruction. If so, it will use its scalar
8206   // value. Avoid hoisting the insert-element which packs the scalar value into
8207   // a vector value, as that happens iff all users use the vector value.
8208   for (auto &Op : I->operands())
8209     if (auto *PredInst = dyn_cast<Instruction>(Op))
8210       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
8211         PredInst2Recipe[PredInst]->setAlsoPack(false);
8212 
8213   // Finalize the recipe for Instr, first if it is not predicated.
8214   if (!IsPredicated) {
8215     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8216     VPBB->appendRecipe(Recipe);
8217     return VPBB;
8218   }
8219   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8220   assert(VPBB->getSuccessors().empty() &&
8221          "VPBB has successors when handling predicated replication.");
8222   // Record predicated instructions for above packing optimizations.
8223   PredInst2Recipe[I] = Recipe;
8224   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8225   VPBlockUtils::insertBlockAfter(Region, VPBB);
8226   auto *RegSucc = new VPBasicBlock();
8227   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8228   return RegSucc;
8229 }
8230 
8231 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8232                                                       VPRecipeBase *PredRecipe,
8233                                                       VPlanPtr &Plan) {
8234   // Instructions marked for predication are replicated and placed under an
8235   // if-then construct to prevent side-effects.
8236 
8237   // Generate recipes to compute the block mask for this region.
8238   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8239 
8240   // Build the triangular if-then region.
8241   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8242   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8243   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8244   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8245   auto *PHIRecipe = Instr->getType()->isVoidTy()
8246                         ? nullptr
8247                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8248   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8249   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8250   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
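  // E.g., a predicated store yields a region with blocks "pred.store.entry",
  // "pred.store.if" and "pred.store.continue".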
8251 
8252   // Note: first set Entry as region entry and then connect successors starting
8253   // from it in order, to propagate the "parent" of each VPBasicBlock.
8254   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8255   VPBlockUtils::connectBlocks(Pred, Exit);
8256 
8257   return Region;
8258 }
8259 
8260 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8261                                                       VFRange &Range,
8262                                                       VPlanPtr &Plan) {
8263   // First, check for specific widening recipes that deal with calls, memory
8264   // operations, inductions and Phi nodes.
8265   if (auto *CI = dyn_cast<CallInst>(Instr))
8266     return tryToWidenCall(CI, Range, *Plan);
8267 
8268   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8269     return tryToWidenMemory(Instr, Range, Plan);
8270 
8271   VPRecipeBase *Recipe;
8272   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8273     if (Phi->getParent() != OrigLoop->getHeader())
8274       return tryToBlend(Phi, Plan);
8275     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
8276       return Recipe;
8277     return new VPWidenPHIRecipe(Phi);
8278   }
8279 
8280   if (isa<TruncInst>(Instr) &&
8281       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
8282     return Recipe;
8283 
8284   if (!shouldWiden(Instr, Range))
8285     return nullptr;
8286 
8287   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8288     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
8289                                 OrigLoop);
8290 
8291   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8292     bool InvariantCond =
8293         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8294     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
8295                                    InvariantCond);
8296   }
8297 
8298   return tryToWiden(Instr, *Plan);
8299 }
8300 
8301 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8302                                                         ElementCount MaxVF) {
8303   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8304 
8305   // Collect instructions from the original loop that will become trivially dead
8306   // in the vectorized loop. We don't need to vectorize these instructions. For
8307   // example, original induction update instructions can become dead because we
8308   // separately emit induction "steps" when generating code for the new loop.
8309   // Similarly, we create a new latch condition when setting up the structure
8310   // of the new loop, so the old one can become dead.
8311   SmallPtrSet<Instruction *, 4> DeadInstructions;
8312   collectTriviallyDeadInstructions(DeadInstructions);
8313 
8314   // Add assume instructions we need to drop to DeadInstructions, to prevent
8315   // them from being added to the VPlan.
8316   // TODO: We only need to drop assumes in blocks that get flattened. If the
8317   // control flow is preserved, we should keep them.
8318   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8319   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8320 
8321   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8322   // Dead instructions do not need sinking. Remove them from SinkAfter.
8323   for (Instruction *I : DeadInstructions)
8324     SinkAfter.erase(I);
8325 
8326   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8327   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8328     VFRange SubRange = {VF, MaxVFPlusOne};
8329     VPlans.push_back(
8330         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8331     VF = SubRange.End;
8332   }
8333 }
8334 
8335 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8336     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8337     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8338 
8339   // Hold a mapping from predicated instructions to their recipes, in order to
8340   // fix their AlsoPack behavior if a user is determined to replicate and use a
8341   // scalar instead of vector value.
8342   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
8343 
8344   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8345 
8346   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8347 
8348   // ---------------------------------------------------------------------------
8349   // Pre-construction: record ingredients whose recipes we'll need to further
8350   // process after constructing the initial VPlan.
8351   // ---------------------------------------------------------------------------
8352 
8353   // Mark instructions we'll need to sink later and their targets as
8354   // ingredients whose recipe we'll need to record.
8355   for (auto &Entry : SinkAfter) {
8356     RecipeBuilder.recordRecipeOf(Entry.first);
8357     RecipeBuilder.recordRecipeOf(Entry.second);
8358   }
8359   for (auto &Reduction : CM.getInLoopReductionChains()) {
8360     PHINode *Phi = Reduction.first;
8361     RecurrenceDescriptor::RecurrenceKind Kind =
8362         Legal->getReductionVars()[Phi].getRecurrenceKind();
8363     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8364 
8365     RecipeBuilder.recordRecipeOf(Phi);
8366     for (auto &R : ReductionOperations) {
8367       RecipeBuilder.recordRecipeOf(R);
8368       // For min/max reductions, where we have a pair of icmp/select, we also
8369       // need to record the ICmp recipe, so it can be removed later.
8370       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8371           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8372         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8373       }
8374     }
8375   }
8376 
8377   // For each interleave group which is relevant for this (possibly trimmed)
8378   // Range, add it to the set of groups to be later applied to the VPlan and add
8379   // placeholders for its members' Recipes which we'll be replacing with a
8380   // single VPInterleaveRecipe.
8381   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8382     auto applyIG = [IG, this](ElementCount VF) -> bool {
8383       return (VF.isVector() && // Query is illegal for VF == 1
8384               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8385                   LoopVectorizationCostModel::CM_Interleave);
8386     };
8387     if (!getDecisionAndClampRange(applyIG, Range))
8388       continue;
8389     InterleaveGroups.insert(IG);
8390     for (unsigned i = 0; i < IG->getFactor(); i++)
8391       if (Instruction *Member = IG->getMember(i))
8392         RecipeBuilder.recordRecipeOf(Member);
8393   }
8394 
8395   // ---------------------------------------------------------------------------
8396   // Build initial VPlan: Scan the body of the loop in a topological order to
8397   // visit each basic block after having visited its predecessor basic blocks.
8398   // ---------------------------------------------------------------------------
8399 
8400   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8401   auto Plan = std::make_unique<VPlan>();
8402   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8403   Plan->setEntry(VPBB);
8404 
8405   // Scan the body of the loop in a topological order to visit each basic block
8406   // after having visited its predecessor basic blocks.
8407   LoopBlocksDFS DFS(OrigLoop);
8408   DFS.perform(LI);
8409 
8410   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8411     // Relevant instructions from basic block BB will be grouped into VPRecipe
8412     // ingredients and fill a new VPBasicBlock.
8413     unsigned VPBBsForBB = 0;
8414     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8415     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8416     VPBB = FirstVPBBForBB;
8417     Builder.setInsertPoint(VPBB);
8418 
8419     // Introduce each ingredient into VPlan.
8420     // TODO: Model and preserve debug intrinsics in VPlan.
8421     for (Instruction &I : BB->instructionsWithoutDebug()) {
8422       Instruction *Instr = &I;
8423 
8424       // First filter out irrelevant instructions, to ensure no recipes are
8425       // built for them.
8426       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8427         continue;
8428 
8429       if (auto Recipe =
8430               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8431         for (auto *Def : Recipe->definedValues()) {
8432           auto *UV = Def->getUnderlyingValue();
8433           Plan->addVPValue(UV, Def);
8434         }
8435 
8436         RecipeBuilder.setRecipe(Instr, Recipe);
8437         VPBB->appendRecipe(Recipe);
8438         continue;
8439       }
8440 
8441       // Otherwise, if all widening options failed, the instruction is to be
8442       // replicated. This may create a successor for VPBB.
8443       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
8444           Instr, Range, VPBB, PredInst2Recipe, Plan);
8445       if (NextVPBB != VPBB) {
8446         VPBB = NextVPBB;
8447         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8448                                     : "");
8449       }
8450     }
8451   }
8452 
8453   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
8454   // may also be empty, such as the last one (VPBB), reflecting original
8455   // basic blocks with no recipes.
8456   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8457   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8458   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8459   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8460   delete PreEntry;
8461 
8462   // ---------------------------------------------------------------------------
8463   // Transform initial VPlan: Apply previously taken decisions, in order, to
8464   // bring the VPlan to its final state.
8465   // ---------------------------------------------------------------------------
8466 
8467   // Apply Sink-After legal constraints.
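       // These pairs typically come from first-order recurrence handling in
       // Legality: a user of the recurrence phi is sunk after the instruction
       // producing the previous value, and that sinking is replayed on the
       // corresponding recipes here.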
8468   for (auto &Entry : SinkAfter) {
8469     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8470     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8471     Sink->moveAfter(Target);
8472   }
8473 
8474   // Interleave memory: for each Interleave Group we marked earlier as relevant
8475   // for this VPlan, replace the Recipes widening its memory instructions with a
8476   // single VPInterleaveRecipe at its insertion point.
8477   for (auto IG : InterleaveGroups) {
8478     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8479         RecipeBuilder.getRecipe(IG->getInsertPos()));
8480     SmallVector<VPValue *, 4> StoredValues;
8481     for (unsigned i = 0; i < IG->getFactor(); ++i)
8482       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
8483         StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
8484 
8485     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8486                                         Recipe->getMask());
8487     VPIG->insertBefore(Recipe);
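         // Re-route users of each member to the corresponding value defined by
         // the interleave recipe. Stores are void-typed and define nothing, so
         // only loads consume a slot (J) of the recipe's defined values.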
8488     unsigned J = 0;
8489     for (unsigned i = 0; i < IG->getFactor(); ++i)
8490       if (Instruction *Member = IG->getMember(i)) {
8491         if (!Member->getType()->isVoidTy()) {
8492           VPValue *OriginalV = Plan->getVPValue(Member);
8493           Plan->removeVPValueFor(Member);
8494           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8495           J++;
8496         }
8497         RecipeBuilder.getRecipe(Member)->eraseFromParent();
8498       }
8499   }
8500 
8501   // Adjust the recipes for any in-loop reductions.
8502   if (Range.Start.isVector())
8503     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
8504 
8505   // Finally, if tail is folded by masking, introduce selects between the phi
8506   // and the live-out instruction of each reduction, at the end of the latch.
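       // The select keeps the newly computed value for active lanes and the
       // incoming phi value for lanes masked off by tail folding, so the final
       // reduction only accounts for real iterations.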
8507   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
8508     Builder.setInsertPoint(VPBB);
8509     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
8510     for (auto &Reduction : Legal->getReductionVars()) {
8511       if (CM.isInLoopReduction(Reduction.first))
8512         continue;
8513       VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
8514       VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
8515       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
8516     }
8517   }
8518 
8519   std::string PlanName;
8520   raw_string_ostream RSO(PlanName);
8521   ElementCount VF = Range.Start;
8522   Plan->addVF(VF);
8523   RSO << "Initial VPlan for VF={" << VF;
8524   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
8525     Plan->addVF(VF);
8526     RSO << "," << VF;
8527   }
8528   RSO << "},UF>=1";
8529   RSO.flush();
8530   Plan->setName(PlanName);
8531 
8532   return Plan;
8533 }
8534 
8535 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8536   // Outer loop handling: outer loops may require CFG and instruction-level
8537   // transformations before even evaluating whether vectorization is
8538   // profitable. Since we cannot modify the incoming IR, we need to build the
8539   // VPlan upfront in the vectorization pipeline.
8540   assert(!OrigLoop->isInnermost());
8541   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8542 
8543   // Create new empty VPlan
8544   auto Plan = std::make_unique<VPlan>();
8545 
8546   // Build hierarchical CFG
8547   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8548   HCFGBuilder.buildHierarchicalCFG();
8549 
8550   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8551        VF *= 2)
8552     Plan->addVF(VF);
8553 
8554   if (EnableVPlanPredication) {
8555     VPlanPredicator VPP(*Plan);
8556     VPP.predicate();
8557 
8558     // Avoid running transformation to recipes until masked code generation in
8559     // VPlan-native path is in place.
8560     return Plan;
8561   }
8562 
8563   SmallPtrSet<Instruction *, 1> DeadInstructions;
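       // Lower the abstract VPInstructions of the plain-CFG VPlan into widening
       // VPRecipes so that the native path can generate code from them.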
8564   VPlanTransforms::VPInstructionsToVPRecipes(
8565       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
8566   return Plan;
8567 }
8568 
8569 // Adjust the recipes for any in-loop reductions. The chain of instructions
8570 // leading from the loop exit instruction to the phi needs to be converted to
8571 // reductions, with one operand being vector and the other being the scalar
8572 // reduction chain.
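     // For example (a sketch, not from a particular test): given
     //   %sum.next = add i32 %sum.phi, %val
     // the widened add is replaced by a VPReductionRecipe that horizontally
     // reduces the vector of %val and adds the result to the scalar chain.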
8573 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8574     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8575   for (auto &Reduction : CM.getInLoopReductionChains()) {
8576     PHINode *Phi = Reduction.first;
8577     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8578     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8579 
8580     // ReductionOperations are ordered top-down from the phi's use to the
8581     // LoopExitValue. We keep track of the previous item (the Chain) to tell
8582     // which of the two operands will remain scalar and which will be reduced.
8583     // For min/max the chain will be the select instructions.
8584     Instruction *Chain = Phi;
8585     for (Instruction *R : ReductionOperations) {
8586       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8587       RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
8588 
8589       VPValue *ChainOp = Plan->getVPValue(Chain);
8590       unsigned FirstOpId;
8591       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8592           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8593         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8594                "Expected to replace a VPWidenSelectSC");
8595         FirstOpId = 1;
8596       } else {
8597         assert(isa<VPWidenRecipe>(WidenRecipe) &&
8598                "Expected to replace a VPWidenSC");
8599         FirstOpId = 0;
8600       }
8601       unsigned VecOpId =
8602           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
8603       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
8604 
8605       auto *CondOp = CM.foldTailByMasking()
8606                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
8607                          : nullptr;
8608       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
8609           &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
8610       WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe);
8611       Plan->removeVPValueFor(R);
8612       Plan->addVPValue(R, RedRecipe);
8613       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
8615       WidenRecipe->eraseFromParent();
8616 
8617       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8618           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8619         VPRecipeBase *CompareRecipe =
8620             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
8621         assert(isa<VPWidenRecipe>(CompareRecipe) &&
8622                "Expected to replace a VPWidenSC");
8623         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
8624                "Expected no remaining users");
8625         CompareRecipe->eraseFromParent();
8626       }
8627       Chain = R;
8628     }
8629   }
8630 }
8631 
8632 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
8633     Value *V, unsigned Part) {
8634   return ILV.getOrCreateVectorValue(V, Part);
8635 }
8636 
8637 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
8638     Value *V, const VPIteration &Instance) {
8639   return ILV.getOrCreateScalarValue(V, Instance);
8640 }
8641 
8642 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
8643                                VPSlotTracker &SlotTracker) const {
8644   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
8645   IG->getInsertPos()->printAsOperand(O, false);
8646   O << ", ";
8647   getAddr()->printAsOperand(O, SlotTracker);
8648   VPValue *Mask = getMask();
8649   if (Mask) {
8650     O << ", ";
8651     Mask->printAsOperand(O, SlotTracker);
8652   }
8653   for (unsigned i = 0; i < IG->getFactor(); ++i)
8654     if (Instruction *I = IG->getMember(i))
8655       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
8656 }
8657 
8658 void VPWidenCallRecipe::execute(VPTransformState &State) {
8659   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
8660                                   *this, State);
8661 }
8662 
8663 void VPWidenSelectRecipe::execute(VPTransformState &State) {
8664   State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
8665                                     this, *this, InvariantCond, State);
8666 }
8667 
8668 void VPWidenRecipe::execute(VPTransformState &State) {
8669   State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
8670 }
8671 
8672 void VPWidenGEPRecipe::execute(VPTransformState &State) {
8673   State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
8674                       *this, State.UF, State.VF, IsPtrLoopInvariant,
8675                       IsIndexLoopInvariant, State);
8676 }
8677 
8678 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
8679   assert(!State.Instance && "Int or FP induction being replicated.");
8680   State.ILV->widenIntOrFpInduction(IV, Trunc);
8681 }
8682 
8683 void VPWidenPHIRecipe::execute(VPTransformState &State) {
8684   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
8685 }
8686 
8687 void VPBlendRecipe::execute(VPTransformState &State) {
8688   State.ILV->setDebugLocFromInst(State.Builder, Phi);
8689   // We know that all PHIs in non-header blocks are converted into
8690   // selects, so we don't have to worry about the insertion order and we
8691   // can just use the builder.
8692   // At this point we generate the predication tree. There may be
8693   // duplications since this is a simple recursive scan, but future
8694   // optimizations will clean it up.
8695 
8696   unsigned NumIncoming = getNumIncomingValues();
8697 
8698   // Generate a sequence of selects of the form:
8699   // SELECT(Mask3, In3,
8700   //        SELECT(Mask2, In2,
8701   //               SELECT(Mask1, In1,
8702   //                      In0)))
8703   // Note that Mask0 is never used: lanes for which no path reaches this phi
8704   // are essentially undef and take their value from In0.
8705   InnerLoopVectorizer::VectorParts Entry(State.UF);
8706   for (unsigned In = 0; In < NumIncoming; ++In) {
8707     for (unsigned Part = 0; Part < State.UF; ++Part) {
8708       // We might have single edge PHIs (blocks) - use an identity
8709       // 'select' for the first PHI operand.
8710       Value *In0 = State.get(getIncomingValue(In), Part);
8711       if (In == 0)
8712         Entry[Part] = In0; // Initialize with the first incoming value.
8713       else {
8714         // Select between the current value and the previous incoming edge
8715         // based on the incoming mask.
8716         Value *Cond = State.get(getMask(In), Part);
8717         Entry[Part] =
8718             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8719       }
8720     }
8721   }
8722   for (unsigned Part = 0; Part < State.UF; ++Part)
8723     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
8724 }
8725 
8726 void VPInterleaveRecipe::execute(VPTransformState &State) {
8727   assert(!State.Instance && "Interleave group being replicated.");
8728   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
8729                                       getStoredValues(), getMask());
8730 }
8731 
8732 void VPReductionRecipe::execute(VPTransformState &State) {
8733   assert(!State.Instance && "Reduction being replicated.");
8734   for (unsigned Part = 0; Part < State.UF; ++Part) {
8735     RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc->getRecurrenceKind();
8736     Value *NewVecOp = State.get(getVecOp(), Part);
8737     if (VPValue *Cond = getCondOp()) {
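           // When the tail is folded by masking, blend masked-off lanes with
           // the reduction identity so they do not affect the reduced value.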
8738       Value *NewCond = State.get(Cond, Part);
8739       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
8740       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
8741           Kind, RdxDesc->getMinMaxRecurrenceKind(), VecTy->getElementType());
8742       Constant *IdenVec =
8743           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
8744       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
8745       NewVecOp = Select;
8746     }
8747     Value *NewRed =
8748         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
8749     Value *PrevInChain = State.get(getChainOp(), Part);
8750     Value *NextInChain;
8751     if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8752         Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8753       NextInChain =
8754           createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
8755                          NewRed, PrevInChain);
8756     } else {
8757       NextInChain = State.Builder.CreateBinOp(
8758           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
8759           PrevInChain);
8760     }
8761     State.set(this, getUnderlyingInstr(), NextInChain, Part);
8762   }
8763 }
8764 
8765 void VPReplicateRecipe::execute(VPTransformState &State) {
8766   if (State.Instance) { // Generate a single instance.
8767     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
8768     State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
8769                                     *State.Instance, IsPredicated, State);
8770     // Insert scalar instance packing it into a vector.
8771     if (AlsoPack && State.VF.isVector()) {
8772       // If we're constructing lane 0, initialize to start from undef.
8773       if (State.Instance->Lane == 0) {
8774         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8775         Value *Undef = UndefValue::get(
8776             VectorType::get(getUnderlyingValue()->getType(), State.VF));
8777         State.ValueMap.setVectorValue(getUnderlyingInstr(),
8778                                       State.Instance->Part, Undef);
8779       }
8780       State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
8781                                            *State.Instance);
8782     }
8783     return;
8784   }
8785 
8786   // Generate scalar instances for all VF lanes of all UF parts, unless the
8787   // instruction is uniform, in which case generate only the first lane for
8788   // each of the UF parts.
8789   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8790   assert((!State.VF.isScalable() || IsUniform) &&
8791          "Can't scalarize a scalable vector");
8792   for (unsigned Part = 0; Part < State.UF; ++Part)
8793     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8794       State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
8795                                       IsPredicated, State);
8796 }
8797 
8798 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8799   assert(State.Instance && "Branch on Mask works only on single instance.");
8800 
8801   unsigned Part = State.Instance->Part;
8802   unsigned Lane = State.Instance->Lane;
8803 
8804   Value *ConditionBit = nullptr;
8805   VPValue *BlockInMask = getMask();
8806   if (BlockInMask) {
8807     ConditionBit = State.get(BlockInMask, Part);
8808     if (ConditionBit->getType()->isVectorTy())
8809       ConditionBit = State.Builder.CreateExtractElement(
8810           ConditionBit, State.Builder.getInt32(Lane));
8811   } else // Block in mask is all-one.
8812     ConditionBit = State.Builder.getTrue();
8813 
8814   // Replace the temporary unreachable terminator with a new conditional branch,
8815   // whose two destinations will be set later when they are created.
8816   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8817   assert(isa<UnreachableInst>(CurrentTerminator) &&
8818          "Expected to replace unreachable terminator with conditional branch.");
8819   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8820   CondBr->setSuccessor(0, nullptr);
8821   ReplaceInstWithInst(CurrentTerminator, CondBr);
8822 }
8823 
8824 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8825   assert(State.Instance && "Predicated instruction PHI works per instance.");
8826   Instruction *ScalarPredInst =
8827       cast<Instruction>(State.get(getOperand(0), *State.Instance));
8828   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8829   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8830   assert(PredicatingBB && "Predicated block has no single predecessor.");
8831 
8832   // By current pack/unpack logic we need to generate only a single phi node: if
8833   // a vector value for the predicated instruction exists at this point it means
8834   // the instruction has vector users only, and a phi for the vector value is
8835   // needed. In this case the recipe of the predicated instruction is marked to
8836   // also do that packing, thereby "hoisting" the insert-element sequence.
8837   // Otherwise, a phi node for the scalar value is needed.
8838   unsigned Part = State.Instance->Part;
8839   Instruction *PredInst =
8840       cast<Instruction>(getOperand(0)->getUnderlyingValue());
8841   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8842     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8843     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8844     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8845     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8846     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8847     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8848   } else {
8849     Type *PredInstType = PredInst->getType();
8850     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8851     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8852     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8853     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8854   }
8855 }
8856 
8857 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8858   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
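       // A store defines no value, so a result VPValue is only passed for
       // loads.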
8859   State.ILV->vectorizeMemoryInstruction(&Ingredient, State,
8860                                         StoredValue ? nullptr : getVPValue(),
8861                                         getAddr(), StoredValue, getMask());
8862 }
8863 
8864 // Determine how to lower the scalar epilogue, which depends on 1) optimizing
8865 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8866 // predication, and 4) a TTI hook that analyzes whether the loop is suitable
8867 // for predication.
8868 static ScalarEpilogueLowering getScalarEpilogueLowering(
8869     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8870     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8871     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8872     LoopVectorizationLegality &LVL) {
8873   // 1) OptSize takes precedence over all other options, i.e. if this is set,
8874   // don't look at hints or options, and don't request a scalar epilogue.
8875   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8876   // LoopAccessInfo (due to code dependency and not being able to reliably get
8877   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8878   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
8879   // versioning when the vectorization is forced, unlike hasOptSize. So revert
8880   // back to the old way and vectorize with versioning when forced. See D81345.)
8881   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8882                                                       PGSOQueryType::IRPass) &&
8883                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8884     return CM_ScalarEpilogueNotAllowedOptSize;
8885 
8886   // 2) If set, obey the directives
8887   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
8888     switch (PreferPredicateOverEpilogue) {
8889     case PreferPredicateTy::ScalarEpilogue:
8890       return CM_ScalarEpilogueAllowed;
8891     case PreferPredicateTy::PredicateElseScalarEpilogue:
8892       return CM_ScalarEpilogueNotNeededUsePredicate;
8893     case PreferPredicateTy::PredicateOrDontVectorize:
8894       return CM_ScalarEpilogueNotAllowedUsePredicate;
8895     }
8896   }
8897 
8898   // 3) If set, obey the hints
8899   switch (Hints.getPredicate()) {
8900   case LoopVectorizeHints::FK_Enabled:
8901     return CM_ScalarEpilogueNotNeededUsePredicate;
8902   case LoopVectorizeHints::FK_Disabled:
8903     return CM_ScalarEpilogueAllowed;
8904   }
8905 
8906   // 4) If the TTI hook indicates this is profitable, request predication.
8907   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8908                                        LVL.getLAI()))
8909     return CM_ScalarEpilogueNotNeededUsePredicate;
8910 
8911   return CM_ScalarEpilogueAllowed;
8912 }
8913 
8914 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
8915                            unsigned Part) {
8916   set(Def, V, Part);
8917   ILV->setVectorValue(IRDef, Part, V);
8918 }
8919 
8920 // Process the loop in the VPlan-native vectorization path. This path builds
8921 // VPlan upfront in the vectorization pipeline, which allows applying
8922 // VPlan-to-VPlan transformations from the very beginning without modifying the
8923 // input LLVM IR.
8924 static bool processLoopInVPlanNativePath(
8925     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8926     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8927     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8928     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8929     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8930 
8931   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
8932     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8933     return false;
8934   }
8935   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8936   Function *F = L->getHeader()->getParent();
8937   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8938 
8939   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8940       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8941 
8942   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8943                                 &Hints, IAI);
8944   // Use the planner for outer loop vectorization.
8945   // TODO: CM is not used at this point inside the planner. Turn CM into an
8946   // optional argument if we don't need it in the future.
8947   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8948 
8949   // Get user vectorization factor.
8950   ElementCount UserVF = Hints.getWidth();
8951 
8952   // Plan how to best vectorize, return the best VF and its cost.
8953   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
8954 
8955   // If we are stress testing VPlan builds, do not attempt to generate vector
8956   // code. Masked vector code generation support will follow soon.
8957   // Also, do not attempt to vectorize if no vector code will be produced.
8958   if (VPlanBuildStressTest || EnableVPlanPredication ||
8959       VectorizationFactor::Disabled() == VF)
8960     return false;
8961 
8962   LVP.setBestPlan(VF.Width, 1);
8963 
8964   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
8965                          &CM, BFI, PSI);
8966   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8967                     << L->getHeader()->getParent()->getName() << "\"\n");
8968   LVP.executePlan(LB, DT);
8969 
8970   // Mark the loop as already vectorized to avoid vectorizing again.
8971   Hints.setAlreadyVectorized();
8972 
8973   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8974   return true;
8975 }
8976 
8977 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8978     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8979                                !EnableLoopInterleaving),
8980       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8981                               !EnableLoopVectorization) {}
8982 
8983 bool LoopVectorizePass::processLoop(Loop *L) {
8984   assert((EnableVPlanNativePath || L->isInnermost()) &&
8985          "VPlan-native path is not enabled. Only process inner loops.");
8986 
8987 #ifndef NDEBUG
8988   const std::string DebugLocStr = getDebugLocString(L);
8989 #endif /* NDEBUG */
8990 
8991   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
8992                     << L->getHeader()->getParent()->getName() << "\" from "
8993                     << DebugLocStr << "\n");
8994 
8995   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
8996 
8997   LLVM_DEBUG(
8998       dbgs() << "LV: Loop hints:"
8999              << " force="
9000              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9001                      ? "disabled"
9002                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9003                             ? "enabled"
9004                             : "?"))
9005              << " width=" << Hints.getWidth()
9006              << " unroll=" << Hints.getInterleave() << "\n");
9007 
9008   // Function containing loop
9009   Function *F = L->getHeader()->getParent();
9010 
9011   // Looking at the diagnostic output is the only way to determine if a loop
9012   // was vectorized (other than looking at the IR or machine code), so it
9013   // is important to generate an optimization remark for each loop. Most of
9014   // these messages are generated as OptimizationRemarkAnalysis. Remarks
9015   // generated as OptimizationRemark and OptimizationRemarkMissed are
9016   // less verbose, reporting vectorized loops and unvectorized loops that may
9017   // benefit from vectorization, respectively.
9018 
9019   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9020     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9021     return false;
9022   }
9023 
9024   PredicatedScalarEvolution PSE(*SE, *L);
9025 
9026   // Check if it is legal to vectorize the loop.
9027   LoopVectorizationRequirements Requirements(*ORE);
9028   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
9029                                 &Requirements, &Hints, DB, AC, BFI, PSI);
9030   if (!LVL.canVectorize(EnableVPlanNativePath)) {
9031     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9032     Hints.emitRemarkWithHints();
9033     return false;
9034   }
9035 
9036   // Check the function attributes and profiles to find out if this function
9037   // should be optimized for size.
9038   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9039       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
9040 
9041   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9042   // here. They may require CFG and instruction level transformations before
9043   // even evaluating whether vectorization is profitable. Since we cannot modify
9044   // the incoming IR, we need to build VPlan upfront in the vectorization
9045   // pipeline.
9046   if (!L->isInnermost())
9047     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9048                                         ORE, BFI, PSI, Hints);
9049 
9050   assert(L->isInnermost() && "Inner loop expected.");
9051 
9052   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9053   // count by optimizing for size, to minimize overheads.
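       // getSmallBestKnownTC returns the exact trip count when SCEV knows it as
       // a constant, and otherwise falls back to the profile-based estimate.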
9054   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9055   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9056     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9057                       << "This loop is worth vectorizing only if no scalar "
9058                       << "iteration overheads are incurred.");
9059     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9060       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9061     else {
9062       LLVM_DEBUG(dbgs() << "\n");
9063       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9064     }
9065   }
9066 
9067   // Check the function attributes to see if implicit floats are allowed.
9068   // FIXME: This check doesn't seem possibly correct -- what if the loop is
9069   // an integer loop and the vector instructions selected are purely integer
9070   // vector instructions?
9071   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9072     reportVectorizationFailure(
9073         "Can't vectorize when the NoImplicitFloat attribute is used",
9074         "loop not vectorized due to NoImplicitFloat attribute",
9075         "NoImplicitFloat", ORE, L);
9076     Hints.emitRemarkWithHints();
9077     return false;
9078   }
9079 
9080   // Check if the target supports potentially unsafe FP vectorization.
9081   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9082   // for the target we're vectorizing for, to make sure none of the
9083   // additional fp-math flags can help.
9084   if (Hints.isPotentiallyUnsafe() &&
9085       TTI->isFPVectorizationPotentiallyUnsafe()) {
9086     reportVectorizationFailure(
9087         "Potentially unsafe FP op prevents vectorization",
9088         "loop not vectorized due to unsafe FP support.",
9089         "UnsafeFP", ORE, L);
9090     Hints.emitRemarkWithHints();
9091     return false;
9092   }
9093 
9094   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9095   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9096 
9097   // If an override option has been passed in for interleaved accesses, use it.
9098   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9099     UseInterleaved = EnableInterleavedMemAccesses;
9100 
9101   // Analyze interleaved memory accesses.
9102   if (UseInterleaved) {
9103     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9104   }
9105 
9106   // Use the cost model.
9107   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9108                                 F, &Hints, IAI);
9109   CM.collectValuesToIgnore();
9110 
9111   // Use the planner for vectorization.
9112   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
9113 
9114   // Get user vectorization factor and interleave count.
9115   ElementCount UserVF = Hints.getWidth();
9116   unsigned UserIC = Hints.getInterleave();
9117 
9118   // Plan how to best vectorize, return the best VF and its cost.
9119   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9120 
9121   VectorizationFactor VF = VectorizationFactor::Disabled();
9122   unsigned IC = 1;
9123 
9124   if (MaybeVF) {
9125     VF = *MaybeVF;
9126     // Select the interleave count.
9127     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9128   }
9129 
9130   // Identify the diagnostic messages that should be produced.
9131   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9132   bool VectorizeLoop = true, InterleaveLoop = true;
9133   if (Requirements.doesNotMeet(F, L, Hints)) {
9134     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
9135                          "requirements.\n");
9136     Hints.emitRemarkWithHints();
9137     return false;
9138   }
9139 
9140   if (VF.Width.isScalar()) {
9141     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9142     VecDiagMsg = std::make_pair(
9143         "VectorizationNotBeneficial",
9144         "the cost-model indicates that vectorization is not beneficial");
9145     VectorizeLoop = false;
9146   }
9147 
9148   if (!MaybeVF && UserIC > 1) {
9149     // Tell the user interleaving was avoided up-front, despite being explicitly
9150     // requested.
9151     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9152                          "interleaving should be avoided up front\n");
9153     IntDiagMsg = std::make_pair(
9154         "InterleavingAvoided",
9155         "Ignoring UserIC, because interleaving was avoided up front");
9156     InterleaveLoop = false;
9157   } else if (IC == 1 && UserIC <= 1) {
9158     // Tell the user interleaving is not beneficial.
9159     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9160     IntDiagMsg = std::make_pair(
9161         "InterleavingNotBeneficial",
9162         "the cost-model indicates that interleaving is not beneficial");
9163     InterleaveLoop = false;
9164     if (UserIC == 1) {
9165       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9166       IntDiagMsg.second +=
9167           " and is explicitly disabled or interleave count is set to 1";
9168     }
9169   } else if (IC > 1 && UserIC == 1) {
9170     // Tell the user interleaving is beneficial, but is explicitly disabled.
9171     LLVM_DEBUG(
9172         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9173     IntDiagMsg = std::make_pair(
9174         "InterleavingBeneficialButDisabled",
9175         "the cost-model indicates that interleaving is beneficial "
9176         "but is explicitly disabled or interleave count is set to 1");
9177     InterleaveLoop = false;
9178   }
9179 
9180   // Override IC if user provided an interleave count.
9181   IC = UserIC > 0 ? UserIC : IC;
9182 
9183   // Emit diagnostic messages, if any.
9184   const char *VAPassName = Hints.vectorizeAnalysisPassName();
9185   if (!VectorizeLoop && !InterleaveLoop) {
9186     // Do not vectorize or interleave the loop.
9187     ORE->emit([&]() {
9188       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9189                                       L->getStartLoc(), L->getHeader())
9190              << VecDiagMsg.second;
9191     });
9192     ORE->emit([&]() {
9193       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9194                                       L->getStartLoc(), L->getHeader())
9195              << IntDiagMsg.second;
9196     });
9197     return false;
9198   } else if (!VectorizeLoop && InterleaveLoop) {
9199     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9200     ORE->emit([&]() {
9201       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9202                                         L->getStartLoc(), L->getHeader())
9203              << VecDiagMsg.second;
9204     });
9205   } else if (VectorizeLoop && !InterleaveLoop) {
9206     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9207                       << ") in " << DebugLocStr << '\n');
9208     ORE->emit([&]() {
9209       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9210                                         L->getStartLoc(), L->getHeader())
9211              << IntDiagMsg.second;
9212     });
9213   } else if (VectorizeLoop && InterleaveLoop) {
9214     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9215                       << ") in " << DebugLocStr << '\n');
9216     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9217   }
9218 
9219   LVP.setBestPlan(VF.Width, IC);
9220 
9221   using namespace ore;
9222   bool DisableRuntimeUnroll = false;
9223   MDNode *OrigLoopID = L->getLoopID();
9224 
9225   if (!VectorizeLoop) {
9226     assert(IC > 1 && "interleave count should not be 1 or 0");
9227     // If we decided that it is not legal to vectorize the loop, then
9228     // interleave it.
9229     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
9230                                BFI, PSI);
9231     LVP.executePlan(Unroller, DT);
9232 
9233     ORE->emit([&]() {
9234       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9235                                 L->getHeader())
9236              << "interleaved loop (interleaved count: "
9237              << NV("InterleaveCount", IC) << ")";
9238     });
9239   } else {
9240     // If we decided that it is *legal* to vectorize the loop, then do it.
9241 
9242     // Consider vectorizing the epilogue too if it's profitable.
9243     VectorizationFactor EpilogueVF =
9244       CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
9245     if (EpilogueVF.Width.isVector()) {
9246 
9247       // The first pass vectorizes the main loop and creates a scalar epilogue
9248       // to be vectorized by executing the plan (potentially with a different
9249       // factor) again shortly afterwards.
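           // EPI carries the VF/UF chosen for both the main vector loop and the
           // epilogue loop, and is shared by the two vectorizer passes below.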
9250       EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
9251                                         EpilogueVF.Width.getKnownMinValue(), 1);
9252       EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI,
9253                                          &LVL, &CM, BFI, PSI);
9254 
9255       LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
9256       LVP.executePlan(MainILV, DT);
9257       ++LoopsVectorized;
9258 
9259       simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9260       formLCSSARecursively(*L, *DT, LI, SE);
9261 
9262       // Second pass vectorizes the epilogue and adjusts the control flow
9263       // edges from the first pass.
9264       LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
9265       EPI.MainLoopVF = EPI.EpilogueVF;
9266       EPI.MainLoopUF = EPI.EpilogueUF;
9267       EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
9268                                                ORE, EPI, &LVL, &CM, BFI, PSI);
9269       LVP.executePlan(EpilogILV, DT);
9270       ++LoopsEpilogueVectorized;
9271 
9272       if (!MainILV.areSafetyChecksAdded())
9273         DisableRuntimeUnroll = true;
9274     } else {
9275       InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
9276                              &LVL, &CM, BFI, PSI);
9277       LVP.executePlan(LB, DT);
9278       ++LoopsVectorized;
9279 
9280       // Add metadata to disable runtime unrolling a scalar loop when there are
9281       // no runtime checks about strides and memory. A scalar loop that is
9282       // rarely used is not worth unrolling.
9283       if (!LB.areSafetyChecksAdded())
9284         DisableRuntimeUnroll = true;
9285     }
9286 
9287     // Report the vectorization decision.
9288     ORE->emit([&]() {
9289       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
9290                                 L->getHeader())
9291              << "vectorized loop (vectorization width: "
9292              << NV("VectorizationFactor", VF.Width)
9293              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
9294     });
9295   }
9296 
9297   Optional<MDNode *> RemainderLoopID =
9298       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
9299                                       LLVMLoopVectorizeFollowupEpilogue});
9300   if (RemainderLoopID.hasValue()) {
9301     L->setLoopID(RemainderLoopID.getValue());
9302   } else {
9303     if (DisableRuntimeUnroll)
9304       AddRuntimeUnrollDisableMetaData(L);
9305 
9306     // Mark the loop as already vectorized to avoid vectorizing again.
9307     Hints.setAlreadyVectorized();
9308   }
9309 
9310   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9311   return true;
9312 }
9313 
9314 LoopVectorizeResult LoopVectorizePass::runImpl(
9315     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
9316     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
9317     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
9318     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
9319     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
9320   SE = &SE_;
9321   LI = &LI_;
9322   TTI = &TTI_;
9323   DT = &DT_;
9324   BFI = &BFI_;
9325   TLI = TLI_;
9326   AA = &AA_;
9327   AC = &AC_;
9328   GetLAA = &GetLAA_;
9329   DB = &DB_;
9330   ORE = &ORE_;
9331   PSI = PSI_;
9332 
9333   // Don't attempt if
9334   // 1. the target claims to have no vector registers, and
9335   // 2. interleaving won't help ILP.
9336   //
9337   // The second condition is necessary because, even if the target has no
9338   // vector registers, loop vectorization may still enable scalar
9339   // interleaving.
9340   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
9341       TTI->getMaxInterleaveFactor(1) < 2)
9342     return LoopVectorizeResult(false, false);
9343 
9344   bool Changed = false, CFGChanged = false;
9345 
9346   // The vectorizer requires loops to be in simplified form.
9347   // Since simplification may add new inner loops, it has to run before the
9348   // legality and profitability checks. This means running the loop vectorizer
9349   // will simplify all loops, regardless of whether anything ends up being
9350   // vectorized.
9351   for (auto &L : *LI)
9352     Changed |= CFGChanged |=
9353         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9354 
9355   // Build up a worklist of inner-loops to vectorize. This is necessary as
9356   // the act of vectorizing or partially unrolling a loop creates new loops
9357   // and can invalidate iterators across the loops.
9358   SmallVector<Loop *, 8> Worklist;
9359 
9360   for (Loop *L : *LI)
9361     collectSupportedLoops(*L, LI, ORE, Worklist);
9362 
9363   LoopsAnalyzed += Worklist.size();
9364 
9365   // Now walk the identified inner loops.
9366   while (!Worklist.empty()) {
9367     Loop *L = Worklist.pop_back_val();
9368 
9369     // For the inner loops we actually process, form LCSSA to simplify the
9370     // transform.
9371     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
9372 
9373     Changed |= CFGChanged |= processLoop(L);
9374   }
9375 
9376   // Process each loop nest in the function.
9377   return LoopVectorizeResult(Changed, CFGChanged);
9378 }
9379 
9380 PreservedAnalyses LoopVectorizePass::run(Function &F,
9381                                          FunctionAnalysisManager &AM) {
9382     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
9383     auto &LI = AM.getResult<LoopAnalysis>(F);
9384     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
9385     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
9386     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
9387     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
9388     auto &AA = AM.getResult<AAManager>(F);
9389     auto &AC = AM.getResult<AssumptionAnalysis>(F);
9390     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
9391     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
9392     MemorySSA *MSSA = EnableMSSALoopDependency
9393                           ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
9394                           : nullptr;
9395 
9396     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
9397     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
9398         [&](Loop &L) -> const LoopAccessInfo & {
9399       LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
9400                                         TLI, TTI, nullptr, MSSA};
9401       return LAM.getResult<LoopAccessAnalysis>(L, AR);
9402     };
9403     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
9404     ProfileSummaryInfo *PSI =
9405         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
9406     LoopVectorizeResult Result =
9407         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
9408     if (!Result.MadeAnyChange)
9409       return PreservedAnalyses::all();
9410     PreservedAnalyses PA;
9411 
9412     // We currently do not preserve loopinfo/dominator analyses with outer loop
9413     // vectorization. Until this is addressed, mark these analyses as preserved
9414     // only for non-VPlan-native path.
9415     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
9416     if (!EnableVPlanNativePath) {
9417       PA.preserve<LoopAnalysis>();
9418       PA.preserve<DominatorTreeAnalysis>();
9419     }
9420     PA.preserve<BasicAA>();
9421     PA.preserve<GlobalsAA>();
9422     if (!Result.MadeCFGChange)
9423       PA.preserveSet<CFGAnalyses>();
9424     return PA;
9425 }
9426