1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
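//
// As a rough illustration (target-independent; the VF of 4 below is an
// arbitrary choice), a scalar loop such as
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten into one 'wide' iteration per four scalar ones:
//
//   for (i = 0; i + 3 < n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3]; // one SIMD iteration
//
// with any remaining iterations handled by a scalar remainder loop.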
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is a development effort going on to migrate the loop vectorizer to
// the VPlan infrastructure and to introduce outer loop vectorization support
// (see docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 #ifndef NDEBUG
161 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
162 #endif
163 
164 /// @{
165 /// Metadata attribute names
166 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
167 const char LLVMLoopVectorizeFollowupVectorized[] =
168     "llvm.loop.vectorize.followup_vectorized";
169 const char LLVMLoopVectorizeFollowupEpilogue[] =
170     "llvm.loop.vectorize.followup_epilogue";
171 /// @}
172 
173 STATISTIC(LoopsVectorized, "Number of loops vectorized");
174 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
175 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
176 
177 static cl::opt<bool> EnableEpilogueVectorization(
178     "enable-epilogue-vectorization", cl::init(false), cl::Hidden,
179     cl::desc("Enable vectorization of epilogue loops."));
180 
181 static cl::opt<unsigned> EpilogueVectorizationForceVF(
182     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
183     cl::desc("When epilogue vectorization is enabled, and a value greater than "
184              "1 is specified, forces the given VF for all applicable epilogue "
185              "loops."));
186 
187 static cl::opt<unsigned> EpilogueVectorizationMinVF(
188     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189     cl::desc("Only loops with vectorization factor equal to or larger than "
190              "the specified value are considered for epilogue vectorization."));
191 
/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
194 static cl::opt<unsigned> TinyTripCountVectorThreshold(
195     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
196     cl::desc("Loops with a constant trip count that is smaller than this "
197              "value are vectorized only if no scalar iteration overheads "
198              "are incurred."));
199 
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and lists the available options.
// I.e., the vectorizer will try to fold the tail loop (epilogue) into the
// vector body and predicate the instructions accordingly. If tail-folding
// fails, there are different fallback strategies depending on these values:
205 namespace PreferPredicateTy {
206   enum Option {
207     ScalarEpilogue = 0,
208     PredicateElseScalarEpilogue,
209     PredicateOrDontVectorize
210   };
211 } // namespace PreferPredicateTy
212 
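// As a hedged illustration of the tradeoff (numbers are made up): with VF = 4
// and a trip count of 10, tail-folding executes 3 masked vector iterations,
// the last of them with only 2 active lanes, whereas a scalar epilogue runs
// 2 unmasked vector iterations followed by 2 scalar remainder iterations.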
213 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
214     "prefer-predicate-over-epilogue",
215     cl::init(PreferPredicateTy::ScalarEpilogue),
216     cl::Hidden,
217     cl::desc("Tail-folding and predication preferences over creating a scalar "
218              "epilogue loop."),
219     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
220                          "scalar-epilogue",
221                          "Don't tail-predicate loops, create scalar epilogue"),
222               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
223                          "predicate-else-scalar-epilogue",
224                          "prefer tail-folding, create scalar epilogue if tail "
225                          "folding fails."),
226               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
227                          "predicate-dont-vectorize",
228                          "prefers tail-folding, don't attempt vectorization if "
229                          "tail-folding fails.")));
230 
231 static cl::opt<bool> MaximizeBandwidth(
232     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
233     cl::desc("Maximize bandwidth when selecting vectorization factor which "
234              "will be determined by the smallest type in loop."));
235 
236 static cl::opt<bool> EnableInterleavedMemAccesses(
237     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
238     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
239 
240 /// An interleave-group may need masking if it resides in a block that needs
241 /// predication, or in order to mask away gaps.
242 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
243     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in "
             "a loop"));
245 
246 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
247     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
249              "below this number"));
250 
251 static cl::opt<unsigned> ForceTargetNumScalarRegs(
252     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
253     cl::desc("A flag that overrides the target's number of scalar registers."));
254 
255 static cl::opt<unsigned> ForceTargetNumVectorRegs(
256     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
257     cl::desc("A flag that overrides the target's number of vector registers."));
258 
259 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
260     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
261     cl::desc("A flag that overrides the target's max interleave factor for "
262              "scalar loops."));
263 
264 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
265     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
266     cl::desc("A flag that overrides the target's max interleave factor for "
267              "vectorized loops."));
268 
269 static cl::opt<unsigned> ForceTargetInstructionCost(
270     "force-target-instruction-cost", cl::init(0), cl::Hidden,
271     cl::desc("A flag that overrides the target's expected cost for "
272              "an instruction to a single constant value. Mostly "
273              "useful for getting consistent testing."));
274 
275 static cl::opt<unsigned> SmallLoopCost(
276     "small-loop-cost", cl::init(20), cl::Hidden,
277     cl::desc(
278         "The cost of a loop that is considered 'small' by the interleaver."));
279 
280 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
281     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
282     cl::desc("Enable the use of the block frequency analysis to access PGO "
283              "heuristics minimizing code growth in cold regions and being more "
284              "aggressive in hot regions."));
285 
286 // Runtime interleave loops for load/store throughput.
287 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
288     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
289     cl::desc(
290         "Enable runtime interleaving until load/store ports are saturated"));
291 
292 /// Interleave small loops with scalar reductions.
293 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
294     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
295     cl::desc("Enable interleaving for loops with small iteration counts that "
296              "contain scalar reductions to expose ILP."));
297 
298 /// The number of stores in a loop that are allowed to need predication.
299 static cl::opt<unsigned> NumberOfStoresToPredicate(
300     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
301     cl::desc("Max number of stores to be predicated behind an if."));
302 
303 static cl::opt<bool> EnableIndVarRegisterHeur(
304     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
305     cl::desc("Count the induction variable only once when interleaving"));
306 
307 static cl::opt<bool> EnableCondStoresVectorization(
308     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));
310 
311 static cl::opt<unsigned> MaxNestedScalarReductionIC(
312     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
313     cl::desc("The maximum interleave count to use when interleaving a scalar "
314              "reduction in a nested loop."));
315 
static cl::opt<bool> PreferInLoopReductions(
    "prefer-inloop-reductions", cl::init(false), cl::Hidden,
    cl::desc("Prefer in-loop vector reductions, "
             "overriding the target's preference."));
321 
322 static cl::opt<bool> PreferPredicatedReductionSelect(
323     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
324     cl::desc(
325         "Prefer predicating a reduction operation over an after loop select."));
326 
327 cl::opt<bool> EnableVPlanNativePath(
328     "enable-vplan-native-path", cl::init(false), cl::Hidden,
329     cl::desc("Enable VPlan-native vectorization path with "
330              "support for outer loop vectorization."));
331 
332 // FIXME: Remove this switch once we have divergence analysis. Currently we
333 // assume divergent non-backedge branches when this switch is true.
334 cl::opt<bool> EnableVPlanPredication(
335     "enable-vplan-predication", cl::init(false), cl::Hidden,
336     cl::desc("Enable VPlan-native vectorization path predicator with "
337              "support for outer loop vectorization."));
338 
339 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
341 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
342 // verification of the H-CFGs built.
343 static cl::opt<bool> VPlanBuildStressTest(
344     "vplan-build-stress-test", cl::init(false), cl::Hidden,
345     cl::desc(
346         "Build VPlan for every supported loop nest in the function and bail "
347         "out right after the build (stress test the VPlan H-CFG construction "
348         "in the VPlan-native vectorization path)."));
349 
350 cl::opt<bool> llvm::EnableLoopInterleaving(
351     "interleave-loops", cl::init(true), cl::Hidden,
352     cl::desc("Enable loop interleaving in Loop vectorization passes"));
353 cl::opt<bool> llvm::EnableLoopVectorization(
354     "vectorize-loops", cl::init(true), cl::Hidden,
355     cl::desc("Run the Loop vectorization passes"));
356 
357 /// A helper function that returns the type of loaded or stored value.
358 static Type *getMemInstValueType(Value *I) {
359   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
360          "Expected Load or Store instruction");
361   if (auto *LI = dyn_cast<LoadInst>(I))
362     return LI->getType();
363   return cast<StoreInst>(I)->getValueOperand()->getType();
364 }
365 
366 /// A helper function that returns true if the given type is irregular. The
367 /// type is irregular if its allocated size doesn't equal the store size of an
368 /// element of the corresponding vector type at the given vectorization factor.
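/// For example (assuming a typical x86 data layout), x86_fp80 has an 80-bit
/// store size but a 96- or 128-bit alloc size and is therefore irregular,
/// whereas i32 is not.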
369 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
370   assert(!VF.isScalable() && "scalable vectors not yet supported.");
371   // Determine if an array of VF elements of type Ty is "bitcast compatible"
372   // with a <VF x Ty> vector.
373   if (VF.isVector()) {
374     auto *VectorTy = VectorType::get(Ty, VF);
375     return TypeSize::get(VF.getKnownMinValue() *
376                              DL.getTypeAllocSize(Ty).getFixedValue(),
377                          VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
378   }
379 
380   // If the vectorization factor is one, we just check if an array of type Ty
381   // requires padding between elements.
382   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
383 }
384 
385 /// A helper function that returns the reciprocal of the block probability of
386 /// predicated blocks. If we return X, we are assuming the predicated block
387 /// will execute once for every X iterations of the loop header.
388 ///
389 /// TODO: We should use actual block probability here, if available. Currently,
390 ///       we always assume predicated blocks have a 50% chance of executing.
391 static unsigned getReciprocalPredBlockProb() { return 2; }
392 
393 /// A helper function that adds a 'fast' flag to floating-point operations.
394 static Value *addFastMathFlag(Value *V) {
395   if (isa<FPMathOperator>(V))
396     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
397   return V;
398 }
399 
400 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
401   if (isa<FPMathOperator>(V))
402     cast<Instruction>(V)->setFastMathFlags(FMF);
403   return V;
404 }
405 
406 /// A helper function that returns an integer or floating-point constant with
407 /// value C.
408 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
409   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
410                            : ConstantFP::get(Ty, C);
411 }
412 
413 /// Returns "best known" trip count for the specified loop \p L as defined by
414 /// the following procedure:
415 ///   1) Returns exact trip count if it is known.
416 ///   2) Returns expected trip count according to profile data if any.
417 ///   3) Returns upper bound estimate if it is known.
418 ///   4) Returns None if all of the above failed.
419 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
420   // Check if exact trip count is known.
421   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
422     return ExpectedTC;
423 
424   // Check if there is an expected trip count available from profile data.
425   if (LoopVectorizeWithBlockFrequency)
426     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
427       return EstimatedTC;
428 
429   // Check if upper bound estimate is known.
430   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
431     return ExpectedTC;
432 
433   return None;
434 }
435 
436 namespace llvm {
437 
438 /// InnerLoopVectorizer vectorizes loops which contain only one basic
439 /// block to a specified vectorization factor (VF).
440 /// This class performs the widening of scalars into vectors, or multiple
441 /// scalars. This class also implements the following features:
442 /// * It inserts an epilogue loop for handling loops that don't have iteration
443 ///   counts that are known to be a multiple of the vectorization factor.
444 /// * It handles the code generation for reduction variables.
445 /// * Scalarization (implementation using scalars) of un-vectorizable
446 ///   instructions.
447 /// InnerLoopVectorizer does not perform any vectorization-legality
448 /// checks, and relies on the caller to check for the different legality
449 /// aspects. The InnerLoopVectorizer relies on the
  /// LoopVectorizationLegality class to provide information about the
  /// induction and reduction variables that were found for a given
  /// vectorization factor.
452 class InnerLoopVectorizer {
453 public:
454   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
455                       LoopInfo *LI, DominatorTree *DT,
456                       const TargetLibraryInfo *TLI,
457                       const TargetTransformInfo *TTI, AssumptionCache *AC,
458                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
459                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
460                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
461                       ProfileSummaryInfo *PSI)
462       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
463         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
464         Builder(PSE.getSE()->getContext()),
465         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
466         BFI(BFI), PSI(PSI) {
467     // Query this against the original loop and save it here because the profile
468     // of the original loop header may change as the transformation happens.
469     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
470         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
471   }
472 
473   virtual ~InnerLoopVectorizer() = default;
474 
475   /// Create a new empty loop that will contain vectorized instructions later
476   /// on, while the old loop will be used as the scalar remainder. Control flow
477   /// is generated around the vectorized (and scalar epilogue) loops consisting
478   /// of various checks and bypasses. Return the pre-header block of the new
479   /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
481   /// handle the more complex control flow around the loops.
482   virtual BasicBlock *createVectorizedLoopSkeleton();
483 
484   /// Widen a single instruction within the innermost loop.
485   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
486                         VPTransformState &State);
487 
488   /// Widen a single call instruction within the innermost loop.
489   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
490                             VPTransformState &State);
491 
492   /// Widen a single select instruction within the innermost loop.
493   void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
494                               bool InvariantCond, VPTransformState &State);
495 
  /// Fix the vectorized code, taking care of header phis, live-outs, and
  /// more.
497   void fixVectorizedLoop();
498 
499   // Return true if any runtime check is added.
500   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
501 
502   /// A type for vectorized values in the new loop. Each value from the
503   /// original loop, when vectorized, is represented by UF vector values in the
504   /// new unrolled loop, where UF is the unroll factor.
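  /// For example (an illustrative choice of factors), UF = 2 and VF = 4
  /// represent one original value as two <4 x Ty> vectors.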
505   using VectorParts = SmallVector<Value *, 2>;
506 
507   /// Vectorize a single GetElementPtrInst based on information gathered and
508   /// decisions taken during planning.
509   void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
510                 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
511                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
512 
513   /// Vectorize a single PHINode in a block. This method handles the induction
514   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
515   /// arbitrary length vectors.
516   void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);
517 
518   /// A helper function to scalarize a single Instruction in the innermost loop.
519   /// Generates a sequence of scalar instances for each lane between \p MinLane
520   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
521   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
522   /// Instr's operands.
523   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
524                             const VPIteration &Instance, bool IfPredicateInstr,
525                             VPTransformState &State);
526 
527   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
528   /// is provided, the integer induction variable will first be truncated to
529   /// the corresponding type.
530   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
531 
532   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
533   /// vector or scalar value on-demand if one is not yet available. When
534   /// vectorizing a loop, we visit the definition of an instruction before its
535   /// uses. When visiting the definition, we either vectorize or scalarize the
536   /// instruction, creating an entry for it in the corresponding map. (In some
537   /// cases, such as induction variables, we will create both vector and scalar
538   /// entries.) Then, as we encounter uses of the definition, we derive values
539   /// for each scalar or vector use unless such a value is already available.
540   /// For example, if we scalarize a definition and one of its uses is vector,
541   /// we build the required vector on-demand with an insertelement sequence
542   /// when visiting the use. Otherwise, if the use is scalar, we can use the
543   /// existing scalar definition.
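  ///
  /// For example (factors chosen only for illustration), if a definition was
  /// scalarized with UF = 1 and VF = 4 and a vectorized user is later visited,
  /// the four scalar instances are packed into a <4 x Ty> vector with an
  /// insertelement sequence (see packScalarIntoVectorValue).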
544   ///
545   /// Return a value in the new loop corresponding to \p V from the original
546   /// loop at unroll index \p Part. If the value has already been vectorized,
547   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
548   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
549   /// a new vector value on-demand by inserting the scalar values into a vector
550   /// with an insertelement sequence. If the value has been neither vectorized
551   /// nor scalarized, it must be loop invariant, so we simply broadcast the
552   /// value into a vector.
553   Value *getOrCreateVectorValue(Value *V, unsigned Part);
554 
555   void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
556     VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
557   }
558 
559   /// Return a value in the new loop corresponding to \p V from the original
560   /// loop at unroll and vector indices \p Instance. If the value has been
561   /// vectorized but not scalarized, the necessary extractelement instruction
562   /// will be generated.
563   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
564 
565   /// Construct the vector value of a scalarized value \p V one lane at a time.
566   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
567 
568   /// Try to vectorize interleaved access group \p Group with the base address
569   /// given in \p Addr, optionally masking the vector operations if \p
570   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
571   /// values in the vectorized loop.
572   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
573                                 VPTransformState &State, VPValue *Addr,
574                                 ArrayRef<VPValue *> StoredValues,
575                                 VPValue *BlockInMask = nullptr);
576 
577   /// Vectorize Load and Store instructions with the base address given in \p
578   /// Addr, optionally masking the vector operations if \p BlockInMask is
579   /// non-null. Use \p State to translate given VPValues to IR values in the
580   /// vectorized loop.
581   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
582                                   VPValue *Def, VPValue *Addr,
583                                   VPValue *StoredValue, VPValue *BlockInMask);
584 
585   /// Set the debug location in the builder using the debug location in
586   /// the instruction.
587   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
588 
589   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs();
591 
592 protected:
593   friend class LoopVectorizationPlanner;
594 
595   /// A small list of PHINodes.
596   using PhiVector = SmallVector<PHINode *, 4>;
597 
598   /// A type for scalarized values in the new loop. Each value from the
599   /// original loop, when scalarized, is represented by UF x VF scalar values
600   /// in the new unrolled loop, where UF is the unroll factor and VF is the
601   /// vectorization factor.
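  /// For example (illustrative factors only), UF = 2 and VF = 4 give eight
  /// scalar values, addressed as [Part][Lane] with two parts of four lanes.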
602   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
603 
604   /// Set up the values of the IVs correctly when exiting the vector loop.
605   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
606                     Value *CountRoundDown, Value *EndValue,
607                     BasicBlock *MiddleBlock);
608 
609   /// Create a new induction variable inside L.
610   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
611                                    Value *Step, Instruction *DL);
612 
613   /// Handle all cross-iteration phis in the header.
614   void fixCrossIterationPHIs();
615 
616   /// Fix a first-order recurrence. This is the second phase of vectorizing
617   /// this phi node.
618   void fixFirstOrderRecurrence(PHINode *Phi);
619 
620   /// Fix a reduction cross-iteration phi. This is the second phase of
621   /// vectorizing this phi node.
622   void fixReduction(PHINode *Phi);
623 
624   /// Clear NSW/NUW flags from reduction instructions if necessary.
625   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
626 
627   /// The Loop exit block may have single value PHI nodes with some
628   /// incoming value. While vectorizing we only handled real values
629   /// that were defined inside the loop and we should have one value for
630   /// each predecessor of its parent basic block. See PR14725.
631   void fixLCSSAPHIs();
632 
633   /// Iteratively sink the scalarized operands of a predicated instruction into
634   /// the block that was created for it.
635   void sinkScalarOperands(Instruction *PredInst);
636 
637   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
638   /// represented as.
639   void truncateToMinimalBitwidths();
640 
641   /// Create a broadcast instruction. This method generates a broadcast
642   /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, and
  /// so on; this is needed because each iteration in the loop corresponds to a
  /// SIMD element.
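  /// For example, for VF = 4 a loop-invariant value %x becomes the splat
  /// <%x, %x, %x, %x>.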
646   virtual Value *getBroadcastInstrs(Value *V);
647 
648   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
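  /// For example, a splat Val = <N, N, N, N> with StartIdx = 0 and Step = 1
  /// yields the vector <N, N+1, N+2, N+3>.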
651   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
652                                Instruction::BinaryOps Opcode =
653                                Instruction::BinaryOpsEnd);
654 
655   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
656   /// variable on which to base the steps, \p Step is the size of the step, and
657   /// \p EntryVal is the value from the original loop that maps to the steps.
658   /// Note that \p EntryVal doesn't have to be an induction variable - it
659   /// can also be a truncate instruction.
660   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
661                         const InductionDescriptor &ID);
662 
663   /// Create a vector induction phi node based on an existing scalar one. \p
664   /// EntryVal is the value from the original loop that maps to the vector phi
665   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
666   /// truncate instruction, instead of widening the original IV, we widen a
667   /// version of the IV truncated to \p EntryVal's type.
668   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
669                                        Value *Step, Instruction *EntryVal);
670 
671   /// Returns true if an instruction \p I should be scalarized instead of
672   /// vectorized for the chosen vectorization factor.
673   bool shouldScalarizeInstruction(Instruction *I) const;
674 
675   /// Returns true if we should generate a scalar version of \p IV.
676   bool needsScalarInduction(Instruction *IV) const;
677 
678   /// If there is a cast involved in the induction variable \p ID, which should
679   /// be ignored in the vectorized loop body, this function records the
680   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
681   /// cast. We had already proved that the casted Phi is equal to the uncasted
682   /// Phi in the vectorized loop (under a runtime guard), and therefore
683   /// there is no need to vectorize the cast - the same value can be used in the
684   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
686   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
687   ///
688   /// \p EntryVal is the value from the original loop that maps to the vector
689   /// phi node and is used to distinguish what is the IV currently being
690   /// processed - original one (if \p EntryVal is a phi corresponding to the
691   /// original IV) or the "newly-created" one based on the proof mentioned above
692   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
693   /// latter case \p EntryVal is a TruncInst and we must not record anything for
694   /// that IV, but it's error-prone to expect callers of this routine to care
695   /// about that, hence this explicit parameter.
696   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
697                                              const Instruction *EntryVal,
698                                              Value *VectorLoopValue,
699                                              unsigned Part,
700                                              unsigned Lane = UINT_MAX);
701 
702   /// Generate a shuffle sequence that will reverse the vector Vec.
703   virtual Value *reverseVector(Value *Vec);
704 
705   /// Returns (and creates if needed) the original loop trip count.
706   Value *getOrCreateTripCount(Loop *NewLoop);
707 
708   /// Returns (and creates if needed) the trip count of the widened loop.
709   Value *getOrCreateVectorTripCount(Loop *NewLoop);
710 
711   /// Returns a bitcasted value to the requested vector type.
712   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
713   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
714                                 const DataLayout &DL);
715 
716   /// Emit a bypass check to see if the vector trip count is zero, including if
717   /// it overflows.
718   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
719 
720   /// Emit a bypass check to see if all of the SCEV assumptions we've
721   /// had to make are correct.
722   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
723 
724   /// Emit bypass checks to check any memory assumptions we may have made.
725   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
726 
727   /// Compute the transformed value of Index at offset StartValue using step
728   /// StepValue.
729   /// For integer induction, returns StartValue + Index * StepValue.
730   /// For pointer induction, returns StartValue[Index * StepValue].
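  /// For example, an integer induction with StartValue 7 and StepValue 3 maps
  /// Index 4 to 7 + 4 * 3 = 19; a pointer induction maps it to a GEP at
  /// element offset 12 from StartValue.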
731   /// FIXME: The newly created binary instructions should contain nsw/nuw
732   /// flags, which can be found from the original scalar operations.
733   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
734                               const DataLayout &DL,
735                               const InductionDescriptor &ID) const;
736 
737   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
738   /// vector loop preheader, middle block and scalar preheader. Also
739   /// allocate a loop object for the new vector loop and return it.
740   Loop *createVectorLoopSkeleton(StringRef Prefix);
741 
742   /// Create new phi nodes for the induction variables to resume iteration count
743   /// in the scalar epilogue, from where the vectorized loop left off (given by
744   /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g., epilogue
746   /// vectorization) and the resume values can come from an additional bypass
747   /// block, the \p AdditionalBypass pair provides information about the bypass
748   /// block and the end value on the edge from bypass to this loop.
749   void createInductionResumeValues(
750       Loop *L, Value *VectorTripCount,
751       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
752 
753   /// Complete the loop skeleton by adding debug MDs, creating appropriate
754   /// conditional branches in the middle block, preparing the builder and
755   /// running the verifier. Take in the vector loop \p L as argument, and return
756   /// the preheader of the completed vector loop.
757   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
758 
759   /// Add additional metadata to \p To that was not present on \p Orig.
760   ///
761   /// Currently this is used to add the noalias annotations based on the
762   /// inserted memchecks.  Use this for instructions that are *cloned* into the
763   /// vector loop.
764   void addNewMetadata(Instruction *To, const Instruction *Orig);
765 
766   /// Add metadata from one instruction to another.
767   ///
768   /// This includes both the original MDs from \p From and additional ones (\see
769   /// addNewMetadata).  Use this for *newly created* instructions in the vector
770   /// loop.
771   void addMetadata(Instruction *To, Instruction *From);
772 
773   /// Similar to the previous function but it adds the metadata to a
774   /// vector of instructions.
775   void addMetadata(ArrayRef<Value *> To, Instruction *From);
776 
777   /// Allow subclasses to override and print debug traces before/after vplan
778   /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}
781 
782   /// The original loop.
783   Loop *OrigLoop;
784 
785   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
786   /// dynamic knowledge to simplify SCEV expressions and converts them to a
787   /// more usable form.
788   PredicatedScalarEvolution &PSE;
789 
790   /// Loop Info.
791   LoopInfo *LI;
792 
793   /// Dominator Tree.
794   DominatorTree *DT;
795 
796   /// Alias Analysis.
797   AAResults *AA;
798 
799   /// Target Library Info.
800   const TargetLibraryInfo *TLI;
801 
802   /// Target Transform Info.
803   const TargetTransformInfo *TTI;
804 
805   /// Assumption Cache.
806   AssumptionCache *AC;
807 
808   /// Interface to emit optimization remarks.
809   OptimizationRemarkEmitter *ORE;
810 
811   /// LoopVersioning.  It's only set up (non-null) if memchecks were
812   /// used.
813   ///
814   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
816   std::unique_ptr<LoopVersioning> LVer;
817 
818   /// The vectorization SIMD factor to use. Each vector will have this many
819   /// vector elements.
820   ElementCount VF;
821 
822   /// The vectorization unroll factor to use. Each scalar is vectorized to this
823   /// many different vector instructions.
824   unsigned UF;
825 
826   /// The builder that we use
827   IRBuilder<> Builder;
828 
829   // --- Vectorization state ---
830 
831   /// The vector-loop preheader.
832   BasicBlock *LoopVectorPreHeader;
833 
834   /// The scalar-loop preheader.
835   BasicBlock *LoopScalarPreHeader;
836 
837   /// Middle Block between the vector and the scalar.
838   BasicBlock *LoopMiddleBlock;
839 
840   /// The ExitBlock of the scalar loop.
841   BasicBlock *LoopExitBlock;
842 
843   /// The vector loop body.
844   BasicBlock *LoopVectorBody;
845 
846   /// The scalar loop body.
847   BasicBlock *LoopScalarBody;
848 
849   /// A list of all bypass blocks. The first block is the entry of the loop.
850   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
851 
852   /// The new Induction variable which was added to the new block.
853   PHINode *Induction = nullptr;
854 
855   /// The induction variable of the old basic block.
856   PHINode *OldInduction = nullptr;
857 
858   /// Maps values from the original loop to their corresponding values in the
859   /// vectorized loop. A key value can map to either vector values, scalar
860   /// values or both kinds of values, depending on whether the key was
861   /// vectorized and scalarized.
862   VectorizerValueMap VectorLoopValueMap;
863 
864   /// Store instructions that were predicated.
865   SmallVector<Instruction *, 4> PredicatedInstructions;
866 
867   /// Trip count of the original loop.
868   Value *TripCount = nullptr;
869 
870   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
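  /// For example, a trip count of 37 with VF * UF = 8 gives a vector trip
  /// count of 32, leaving 5 iterations for the remainder loop.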
871   Value *VectorTripCount = nullptr;
872 
873   /// The legality analysis.
874   LoopVectorizationLegality *Legal;
875 
  /// The profitability analysis.
877   LoopVectorizationCostModel *Cost;
878 
879   // Record whether runtime checks are added.
880   bool AddedSafetyChecks = false;
881 
882   // Holds the end values for each induction variable. We save the end values
883   // so we can later fix-up the external users of the induction variables.
884   DenseMap<PHINode *, Value *> IVEndValues;
885 
886   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
887   // fixed up at the end of vector code generation.
888   SmallVector<PHINode *, 8> OrigPHIsToFix;
889 
890   /// BFI and PSI are used to check for profile guided size optimizations.
891   BlockFrequencyInfo *BFI;
892   ProfileSummaryInfo *PSI;
893 
  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
896   bool OptForSizeBasedOnProfile;
897 };
898 
899 class InnerLoopUnroller : public InnerLoopVectorizer {
900 public:
901   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
902                     LoopInfo *LI, DominatorTree *DT,
903                     const TargetLibraryInfo *TLI,
904                     const TargetTransformInfo *TTI, AssumptionCache *AC,
905                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
906                     LoopVectorizationLegality *LVL,
907                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
908                     ProfileSummaryInfo *PSI)
909       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
910                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
911                             BFI, PSI) {}
912 
913 private:
914   Value *getBroadcastInstrs(Value *V) override;
915   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
916                        Instruction::BinaryOps Opcode =
917                        Instruction::BinaryOpsEnd) override;
918   Value *reverseVector(Value *Vec) override;
919 };
920 
921 /// Encapsulate information regarding vectorization of a loop and its epilogue.
922 /// This information is meant to be updated and used across two stages of
923 /// epilogue vectorization.
924 struct EpilogueLoopVectorizationInfo {
925   ElementCount MainLoopVF = ElementCount::getFixed(0);
926   unsigned MainLoopUF = 0;
927   ElementCount EpilogueVF = ElementCount::getFixed(0);
928   unsigned EpilogueUF = 0;
929   BasicBlock *MainLoopIterationCountCheck = nullptr;
930   BasicBlock *EpilogueIterationCountCheck = nullptr;
931   BasicBlock *SCEVSafetyCheck = nullptr;
932   BasicBlock *MemSafetyCheck = nullptr;
933   Value *TripCount = nullptr;
934   Value *VectorTripCount = nullptr;
935 
936   EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
937                                 unsigned EUF)
938       : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
939         EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
940     assert(EUF == 1 &&
941            "A high UF for the epilogue loop is likely not beneficial.");
942   }
943 };
944 
945 /// An extension of the inner loop vectorizer that creates a skeleton for a
946 /// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue.  This is achieved by
950 /// deriving two concrete strategy classes from this base class and invoking
951 /// them in succession from the loop vectorizer planner.
952 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
953 public:
954   InnerLoopAndEpilogueVectorizer(
955       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
956       DominatorTree *DT, const TargetLibraryInfo *TLI,
957       const TargetTransformInfo *TTI, AssumptionCache *AC,
958       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
959       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
960       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
961       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
962                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
963         EPI(EPI) {}
964 
965   // Override this function to handle the more complex control flow around the
966   // three loops.
967   BasicBlock *createVectorizedLoopSkeleton() final override {
968     return createEpilogueVectorizedLoopSkeleton();
969   }
970 
971   /// The interface for creating a vectorized skeleton using one of two
972   /// different strategies, each corresponding to one execution of the vplan
973   /// as described above.
974   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
975 
976   /// Holds and updates state information required to vectorize the main loop
977   /// and its epilogue in two separate passes. This setup helps us avoid
978   /// regenerating and recomputing runtime safety checks. It also helps us to
979   /// shorten the iteration-count-check path length for the cases where the
980   /// iteration count of the loop is so small that the main vector loop is
981   /// completely skipped.
982   EpilogueLoopVectorizationInfo &EPI;
983 };
984 
985 /// A specialized derived class of inner loop vectorizer that performs
986 /// vectorization of *main* loops in the process of vectorizing loops and their
987 /// epilogues.
988 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
989 public:
990   EpilogueVectorizerMainLoop(
991       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
992       DominatorTree *DT, const TargetLibraryInfo *TLI,
993       const TargetTransformInfo *TTI, AssumptionCache *AC,
994       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
995       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
996       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
997       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
998                                        EPI, LVL, CM, BFI, PSI) {}
999   /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
1001   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1002 
1003 protected:
1004   /// Emits an iteration count bypass check once for the main loop (when \p
1005   /// ForEpilogue is false) and once for the epilogue loop (when \p
1006   /// ForEpilogue is true).
1007   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
1008                                              bool ForEpilogue);
1009   void printDebugTracesAtStart() override;
1010   void printDebugTracesAtEnd() override;
1011 };
1012 
1013 // A specialized derived class of inner loop vectorizer that performs
1014 // vectorization of *epilogue* loops in the process of vectorizing loops and
1015 // their epilogues.
1016 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
1017 public:
1018   EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
1019                     LoopInfo *LI, DominatorTree *DT,
1020                     const TargetLibraryInfo *TLI,
1021                     const TargetTransformInfo *TTI, AssumptionCache *AC,
1022                     OptimizationRemarkEmitter *ORE,
1023                     EpilogueLoopVectorizationInfo &EPI,
1024                     LoopVectorizationLegality *LVL,
1025                     llvm::LoopVectorizationCostModel *CM,
1026                     BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
1027       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1028                                        EPI, LVL, CM, BFI, PSI) {}
1029   /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
1031   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1032 
1033 protected:
1034   /// Emits an iteration count bypass check after the main vector loop has
1035   /// finished to see if there are any iterations left to execute by either
1036   /// the vector epilogue or the scalar epilogue.
1037   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1038                                                       BasicBlock *Bypass,
1039                                                       BasicBlock *Insert);
1040   void printDebugTracesAtStart() override;
1041   void printDebugTracesAtEnd() override;
1042 };
1043 } // end namespace llvm
1044 
/// Look for a meaningful debug location on the instruction or its operands.
1047 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1048   if (!I)
1049     return I;
1050 
1051   DebugLoc Empty;
1052   if (I->getDebugLoc() != Empty)
1053     return I;
1054 
1055   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
1056     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
1057       if (OpInst->getDebugLoc() != Empty)
1058         return OpInst;
1059   }
1060 
1061   return I;
1062 }
1063 
1064 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
1065   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1066     const DILocation *DIL = Inst->getDebugLoc();
1067     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1068         !isa<DbgInfoIntrinsic>(Inst)) {
1069       assert(!VF.isScalable() && "scalable vectors not yet supported.");
1070       auto NewDIL =
1071           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1072       if (NewDIL)
1073         B.SetCurrentDebugLocation(NewDIL.getValue());
1074       else
1075         LLVM_DEBUG(dbgs()
1076                    << "Failed to create new discriminator: "
1077                    << DIL->getFilename() << " Line: " << DIL->getLine());
1078     }
1079     else
1080       B.SetCurrentDebugLocation(DIL);
1081   } else
1082     B.SetCurrentDebugLocation(DebugLoc());
1083 }
1084 
1085 /// Write a record \p DebugMsg about vectorization failure to the debug
1086 /// output stream. If \p I is passed, it is an instruction that prevents
1087 /// vectorization.
1088 #ifndef NDEBUG
1089 static void debugVectorizationFailure(const StringRef DebugMsg,
1090     Instruction *I) {
1091   dbgs() << "LV: Not vectorizing: " << DebugMsg;
1092   if (I != nullptr)
1093     dbgs() << " " << *I;
1094   else
1095     dbgs() << '.';
1096   dbgs() << '\n';
1097 }
1098 #endif
1099 
1100 /// Create an analysis remark that explains why vectorization failed
1101 ///
1102 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1103 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1104 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1105 /// the location of the remark.  \return the remark object that can be
1106 /// streamed to.
1107 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1108     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1109   Value *CodeRegion = TheLoop->getHeader();
1110   DebugLoc DL = TheLoop->getStartLoc();
1111 
1112   if (I) {
1113     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
1116     if (I->getDebugLoc())
1117       DL = I->getDebugLoc();
1118   }
1119 
1120   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
1121   R << "loop not vectorized: ";
1122   return R;
1123 }
1124 
1125 namespace llvm {
1126 
1127 void reportVectorizationFailure(const StringRef DebugMsg,
1128     const StringRef OREMsg, const StringRef ORETag,
1129     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
1130   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
1131   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1132   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
1133                 ORETag, TheLoop, I) << OREMsg);
1134 }
1135 
1136 } // end namespace llvm
1137 
1138 #ifndef NDEBUG
1139 /// \return string containing a file name and a line # for the given loop.
1140 static std::string getDebugLocString(const Loop *L) {
1141   std::string Result;
1142   if (L) {
1143     raw_string_ostream OS(Result);
1144     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1145       LoopDbgLoc.print(OS);
1146     else
1147       // Just print the module name.
1148       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1149     OS.flush();
1150   }
1151   return Result;
1152 }
1153 #endif
1154 
1155 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1156                                          const Instruction *Orig) {
1157   // If the loop was versioned with memchecks, add the corresponding no-alias
1158   // metadata.
1159   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1160     LVer->annotateInstWithNoAlias(To, Orig);
1161 }
1162 
1163 void InnerLoopVectorizer::addMetadata(Instruction *To,
1164                                       Instruction *From) {
1165   propagateMetadata(To, From);
1166   addNewMetadata(To, From);
1167 }
1168 
1169 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1170                                       Instruction *From) {
1171   for (Value *V : To) {
1172     if (Instruction *I = dyn_cast<Instruction>(V))
1173       addMetadata(I, From);
1174   }
1175 }
1176 
1177 namespace llvm {
1178 
// Loop vectorization cost-model hints for how the scalar epilogue loop should
// be lowered.
1181 enum ScalarEpilogueLowering {
1182 
1183   // The default: allowing scalar epilogues.
1184   CM_ScalarEpilogueAllowed,
1185 
1186   // Vectorization with OptForSize: don't allow epilogues.
1187   CM_ScalarEpilogueNotAllowedOptSize,
1188 
  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
1193   CM_ScalarEpilogueNotAllowedLowTripLoop,
1194 
1195   // Loop hint predicate indicating an epilogue is undesired.
1196   CM_ScalarEpilogueNotNeededUsePredicate
1197 };
1198 
1199 /// LoopVectorizationCostModel - estimates the expected speedups due to
1200 /// vectorization.
1201 /// In many cases vectorization is not profitable. This can happen because of
1202 /// a number of reasons. In this class we mainly attempt to predict the
1203 /// expected speedup/slowdowns due to the supported instruction set. We use the
1204 /// TargetTransformInfo to query the different backends for the cost of
1205 /// different operations.
1206 class LoopVectorizationCostModel {
1207 public:
1208   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1209                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1210                              LoopVectorizationLegality *Legal,
1211                              const TargetTransformInfo &TTI,
1212                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1213                              AssumptionCache *AC,
1214                              OptimizationRemarkEmitter *ORE, const Function *F,
1215                              const LoopVectorizeHints *Hints,
1216                              InterleavedAccessInfo &IAI)
1217       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1218         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1219         Hints(Hints), InterleaveInfo(IAI) {}
1220 
1221   /// \return An upper bound for the vectorization factor, or None if
1222   /// vectorization and interleaving should be avoided up front.
1223   Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1224 
1225   /// \return True if runtime checks are required for vectorization, and false
1226   /// otherwise.
1227   bool runtimeChecksRequired();
1228 
1229   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero,
  /// that vectorization factor will be selected if vectorization is possible.
1233   VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1234   VectorizationFactor
1235   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1236                                     const LoopVectorizationPlanner &LVP);
1237 
1238   /// Setup cost-based decisions for user vectorization factor.
1239   void selectUserVectorizationFactor(ElementCount UserVF) {
1240     collectUniformsAndScalars(UserVF);
1241     collectInstsToScalarize(UserVF);
1242   }
1243 
1244   /// \return The size (in bits) of the smallest and widest types in the code
1245   /// that needs to be vectorized. We ignore values that remain scalar such as
1246   /// 64 bit loop indices.
1247   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1248 
1249   /// \return The desired interleave count.
1250   /// If interleave count has been specified by metadata it will be returned.
1251   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1252   /// are the selected vectorization factor and the cost of the selected VF.
1253   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1254 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1262   void setCostBasedWideningDecision(ElementCount VF);
1263 
1264   /// A struct that represents some properties of the register usage
1265   /// of a loop.
1266   struct RegisterUsage {
1267     /// Holds the number of loop invariant values that are used in the loop.
1268     /// The key is ClassID of target-provided register class.
1269     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1270     /// Holds the maximum number of concurrent live intervals in the loop.
1271     /// The key is ClassID of target-provided register class.
1272     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1273   };
1274 
1275   /// \return Returns information about the register usages of the loop for the
1276   /// given vectorization factors.
1277   SmallVector<RegisterUsage, 8>
1278   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1279 
1280   /// Collect values we want to ignore in the cost model.
1281   void collectValuesToIgnore();
1282 
  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
1285   void collectInLoopReductions();
1286 
1287   /// \returns The smallest bitwidth each instruction can be represented with.
1288   /// The vector equivalents of these instructions should be truncated to this
1289   /// type.
1290   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1291     return MinBWs;
1292   }
1293 
1294   /// \returns True if it is more profitable to scalarize instruction \p I for
1295   /// vectorization factor \p VF.
1296   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1297     assert(VF.isVector() &&
1298            "Profitable to scalarize relevant only for VF > 1.");
1299 
1300     // Cost model is not run in the VPlan-native path - return conservative
1301     // result until this changes.
1302     if (EnableVPlanNativePath)
1303       return false;
1304 
1305     auto Scalars = InstsToScalarize.find(VF);
1306     assert(Scalars != InstsToScalarize.end() &&
1307            "VF not yet analyzed for scalarization profitability");
1308     return Scalars->second.find(I) != Scalars->second.end();
1309   }
1310 
1311   /// Returns true if \p I is known to be uniform after vectorization.
1312   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1313     if (VF.isScalar())
1314       return true;
1315 
1316     // Cost model is not run in the VPlan-native path - return conservative
1317     // result until this changes.
1318     if (EnableVPlanNativePath)
1319       return false;
1320 
1321     auto UniformsPerVF = Uniforms.find(VF);
1322     assert(UniformsPerVF != Uniforms.end() &&
1323            "VF not yet analyzed for uniformity");
1324     return UniformsPerVF->second.count(I);
1325   }
1326 
1327   /// Returns true if \p I is known to be scalar after vectorization.
1328   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1329     if (VF.isScalar())
1330       return true;
1331 
1332     // Cost model is not run in the VPlan-native path - return conservative
1333     // result until this changes.
1334     if (EnableVPlanNativePath)
1335       return false;
1336 
1337     auto ScalarsPerVF = Scalars.find(VF);
1338     assert(ScalarsPerVF != Scalars.end() &&
1339            "Scalar values are not calculated for VF");
1340     return ScalarsPerVF->second.count(I);
1341   }
1342 
1343   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1344   /// for vectorization factor \p VF.
1345   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1346     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1347            !isProfitableToScalarize(I, VF) &&
1348            !isScalarAfterVectorization(I, VF);
1349   }
1350 
1351   /// Decision that was taken during cost calculation for memory instruction.
1352   enum InstWidening {
1353     CM_Unknown,
1354     CM_Widen,         // For consecutive accesses with stride +1.
1355     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1356     CM_Interleave,
1357     CM_GatherScatter,
1358     CM_Scalarize
1359   };
1360 
1361   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1362   /// instruction \p I and vector width \p VF.
1363   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1364                            unsigned Cost) {
1365     assert(VF.isVector() && "Expected VF >=2");
1366     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1367   }
1368 
1369   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1370   /// interleaving group \p Grp and vector width \p VF.
1371   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1372                            ElementCount VF, InstWidening W, unsigned Cost) {
1373     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group. But the
    // cost will be assigned to one instruction only.
1376     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1377       if (auto *I = Grp->getMember(i)) {
1378         if (Grp->getInsertPos() == I)
1379           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1380         else
1381           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1382       }
1383     }
1384   }
1385 
1386   /// Return the cost model decision for the given instruction \p I and vector
1387   /// width \p VF. Return CM_Unknown if this instruction did not pass
1388   /// through the cost modeling.
1389   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1390     assert(!VF.isScalable() && "scalable vectors not yet supported.");
1391     assert(VF.isVector() && "Expected VF >=2");
1392 
1393     // Cost model is not run in the VPlan-native path - return conservative
1394     // result until this changes.
1395     if (EnableVPlanNativePath)
1396       return CM_GatherScatter;
1397 
1398     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1399     auto Itr = WideningDecisions.find(InstOnVF);
1400     if (Itr == WideningDecisions.end())
1401       return CM_Unknown;
1402     return Itr->second.first;
1403   }
1404 
1405   /// Return the vectorization cost for the given instruction \p I and vector
1406   /// width \p VF.
1407   unsigned getWideningCost(Instruction *I, ElementCount VF) {
1408     assert(VF.isVector() && "Expected VF >=2");
1409     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1410     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1411            "The cost is not calculated");
1412     return WideningDecisions[InstOnVF].second;
1413   }
1414 
1415   /// Return True if instruction \p I is an optimizable truncate whose operand
1416   /// is an induction variable. Such a truncate will be removed by adding a new
1417   /// induction variable with the destination type.
1418   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1419     // If the instruction is not a truncate, return false.
1420     auto *Trunc = dyn_cast<TruncInst>(I);
1421     if (!Trunc)
1422       return false;
1423 
1424     // Get the source and destination types of the truncate.
1425     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1426     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1427 
1428     // If the truncate is free for the given types, return false. Replacing a
1429     // free truncate with an induction variable would add an induction variable
1430     // update instruction to each iteration of the loop. We exclude from this
1431     // check the primary induction variable since it will need an update
1432     // instruction regardless.
1433     Value *Op = Trunc->getOperand(0);
1434     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1435       return false;
1436 
1437     // If the truncated value is not an induction variable, return false.
1438     return Legal->isInductionPhi(Op);
1439   }
1440 
1441   /// Collects the instructions to scalarize for each predicated instruction in
1442   /// the loop.
1443   void collectInstsToScalarize(ElementCount VF);
1444 
1445   /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decision for Load/Store instructions that may
  /// be vectorized as interleaved, gather-scatter or scalarized accesses.
1448   void collectUniformsAndScalars(ElementCount VF) {
1449     // Do the analysis once.
1450     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1451       return;
1452     setCostBasedWideningDecision(VF);
1453     collectLoopUniforms(VF);
1454     collectLoopScalars(VF);
1455   }
1456 
1457   /// Returns true if the target machine supports masked store operation
1458   /// for the given \p DataType and kind of access to \p Ptr.
1459   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1460     return Legal->isConsecutivePtr(Ptr) &&
1461            TTI.isLegalMaskedStore(DataType, Alignment);
1462   }
1463 
1464   /// Returns true if the target machine supports masked load operation
1465   /// for the given \p DataType and kind of access to \p Ptr.
1466   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1467     return Legal->isConsecutivePtr(Ptr) &&
1468            TTI.isLegalMaskedLoad(DataType, Alignment);
1469   }
1470 
1471   /// Returns true if the target machine supports masked scatter operation
1472   /// for the given \p DataType.
1473   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1474     return TTI.isLegalMaskedScatter(DataType, Alignment);
1475   }
1476 
1477   /// Returns true if the target machine supports masked gather operation
1478   /// for the given \p DataType.
1479   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1480     return TTI.isLegalMaskedGather(DataType, Alignment);
1481   }
1482 
1483   /// Returns true if the target machine can represent \p V as a masked gather
1484   /// or scatter operation.
1485   bool isLegalGatherOrScatter(Value *V) {
1486     bool LI = isa<LoadInst>(V);
1487     bool SI = isa<StoreInst>(V);
1488     if (!LI && !SI)
1489       return false;
1490     auto *Ty = getMemInstValueType(V);
1491     Align Align = getLoadStoreAlignment(V);
1492     return (LI && isLegalMaskedGather(Ty, Align)) ||
1493            (SI && isLegalMaskedScatter(Ty, Align));
1494   }
1495 
1496   /// Returns true if \p I is an instruction that will be scalarized with
1497   /// predication. Such instructions include conditional stores and
1498   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1501   bool isScalarWithPredication(Instruction *I,
1502                                ElementCount VF = ElementCount::getFixed(1));
1503 
1504   // Returns true if \p I is an instruction that will be predicated either
1505   // through scalar predication or masked load/store or masked gather/scatter.
1506   // Superset of instructions that return true for isScalarWithPredication.
1507   bool isPredicatedInst(Instruction *I) {
1508     if (!blockNeedsPredication(I->getParent()))
1509       return false;
1510     // Loads and stores that need some form of masked operation are predicated
1511     // instructions.
1512     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1513       return Legal->isMaskRequired(I);
1514     return isScalarWithPredication(I);
1515   }
1516 
1517   /// Returns true if \p I is a memory instruction with consecutive memory
1518   /// access that can be widened.
1519   bool
1520   memoryInstructionCanBeWidened(Instruction *I,
1521                                 ElementCount VF = ElementCount::getFixed(1));
1522 
1523   /// Returns true if \p I is a memory instruction in an interleaved-group
1524   /// of memory accesses that can be vectorized with wide vector loads/stores
1525   /// and shuffles.
1526   bool
1527   interleavedAccessCanBeWidened(Instruction *I,
1528                                 ElementCount VF = ElementCount::getFixed(1));
1529 
1530   /// Check if \p Instr belongs to any interleaved access group.
1531   bool isAccessInterleaved(Instruction *Instr) {
1532     return InterleaveInfo.isInterleaved(Instr);
1533   }
1534 
1535   /// Get the interleaved access group that \p Instr belongs to.
1536   const InterleaveGroup<Instruction> *
1537   getInterleavedAccessGroup(Instruction *Instr) {
1538     return InterleaveInfo.getInterleaveGroup(Instr);
1539   }
1540 
1541   /// Returns true if an interleaved group requires a scalar iteration
1542   /// to handle accesses with gaps, and there is nothing preventing us from
1543   /// creating a scalar epilogue.
1544   bool requiresScalarEpilogue() const {
1545     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1546   }
1547 
1548   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1549   /// loop hint annotation.
1550   bool isScalarEpilogueAllowed() const {
1551     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1552   }
1553 
1554   /// Returns true if all loop blocks should be masked to fold tail loop.
1555   bool foldTailByMasking() const { return FoldTailByMasking; }
1556 
1557   bool blockNeedsPredication(BasicBlock *BB) {
1558     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1559   }
1560 
1561   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1562   /// nodes to the chain of instructions representing the reductions. Uses a
1563   /// MapVector to ensure deterministic iteration order.
1564   using ReductionChainMap =
1565       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1566 
1567   /// Return the chain of instructions representing an inloop reduction.
1568   const ReductionChainMap &getInLoopReductionChains() const {
1569     return InLoopReductionChains;
1570   }
1571 
1572   /// Returns true if the Phi is part of an inloop reduction.
1573   bool isInLoopReduction(PHINode *Phi) const {
1574     return InLoopReductionChains.count(Phi);
1575   }
1576 
1577   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1578   /// with factor VF.  Return the cost of the instruction, including
1579   /// scalarization overhead if it's needed.
1580   unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1581 
1582   /// Estimate cost of a call instruction CI if it were vectorized with factor
1583   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1587   unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1588                              bool &NeedToScalarize);
1589 
1590   /// Invalidates decisions already taken by the cost model.
1591   void invalidateCostModelingDecisions() {
1592     WideningDecisions.clear();
1593     Uniforms.clear();
1594     Scalars.clear();
1595   }
1596 
1597 private:
1598   unsigned NumPredStores = 0;
1599 
1600   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1601   /// than zero. One is returned if vectorization should best be avoided due
1602   /// to cost.
1603   ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1604                                     ElementCount UserVF);
1605 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1613   using VectorizationCostTy = std::pair<unsigned, bool>;
1614 
1615   /// Returns the expected execution cost. The unit of the cost does
1616   /// not matter because we use the 'cost' units to compare different
1617   /// vector widths. The cost that is returned is *not* normalized by
1618   /// the factor width.
1619   VectorizationCostTy expectedCost(ElementCount VF);
1620 
1621   /// Returns the execution time cost of an instruction for a given vector
1622   /// width. Vector width of one means scalar.
1623   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1624 
1625   /// The cost-computation logic from getInstructionCost which provides
1626   /// the vector type as an output parameter.
1627   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1628 
1629   /// Calculate vectorization cost of memory instruction \p I.
1630   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1631 
1632   /// The cost computation for scalarized memory instruction.
1633   unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1634 
1635   /// The cost computation for interleaving group of memory instructions.
1636   unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1637 
1638   /// The cost computation for Gather/Scatter instruction.
1639   unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1640 
1641   /// The cost computation for widening instruction \p I with consecutive
1642   /// memory access.
1643   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1644 
  /// The cost computation for a Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of the
  /// last element).
1649   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1650 
1651   /// Estimate the overhead of scalarizing an instruction. This is a
1652   /// convenience wrapper for the type-based getScalarizationOverhead API.
1653   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1654 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1657   bool isConsecutiveLoadOrStore(Instruction *I);
1658 
1659   /// Returns true if an artificially high cost for emulated masked memrefs
1660   /// should be used.
1661   bool useEmulatedMaskMemRefHack(Instruction *I);
1662 
1663   /// Map of scalar integer values to the smallest bitwidth they can be legally
1664   /// represented as. The vector equivalents of these values should be truncated
1665   /// to this type.
1666   MapVector<Instruction *, uint64_t> MinBWs;
1667 
1668   /// A type representing the costs for instructions if they were to be
1669   /// scalarized rather than vectorized. The entries are Instruction-Cost
1670   /// pairs.
1671   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1672 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1675   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1676 
1677   /// Records whether it is allowed to have the original scalar loop execute at
1678   /// least once. This may be needed as a fallback loop in case runtime
1679   /// aliasing/dependence checks fail, or to handle the tail/remainder
1680   /// iterations when the trip count is unknown or doesn't divide by the VF,
1681   /// or as a peel-loop to handle gaps in interleave-groups.
1682   /// Under optsize and when the trip count is very small we don't allow any
1683   /// iterations to execute in the scalar loop.
1684   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1685 
1686   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1687   bool FoldTailByMasking = false;
1688 
1689   /// A map holding scalar costs for different vectorization factors. The
1690   /// presence of a cost for an instruction in the mapping indicates that the
1691   /// instruction will be scalarized when vectorizing with the associated
1692   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1693   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1694 
1695   /// Holds the instructions known to be uniform after vectorization.
1696   /// The data is collected per VF.
1697   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1698 
1699   /// Holds the instructions known to be scalar after vectorization.
1700   /// The data is collected per VF.
1701   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1702 
1703   /// Holds the instructions (address computations) that are forced to be
1704   /// scalarized.
1705   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1706 
1707   /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
1710   ReductionChainMap InLoopReductionChains;
1711 
1712   /// Returns the expected difference in cost from scalarizing the expression
1713   /// feeding a predicated instruction \p PredInst. The instructions to
1714   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1715   /// non-negative return value implies the expression will be scalarized.
1716   /// Currently, only single-use chains are considered for scalarization.
1717   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1718                               ElementCount VF);
1719 
1720   /// Collect the instructions that are uniform after vectorization. An
1721   /// instruction is uniform if we represent it with a single scalar value in
1722   /// the vectorized loop corresponding to each vector iteration. Examples of
1723   /// uniform instructions include pointer operands of consecutive or
1724   /// interleaved memory accesses. Note that although uniformity implies an
1725   /// instruction will be scalar, the reverse is not true. In general, a
1726   /// scalarized instruction will be represented by VF scalar values in the
1727   /// vectorized loop, each corresponding to an iteration of the original
1728   /// scalar loop.
1729   void collectLoopUniforms(ElementCount VF);
1730 
1731   /// Collect the instructions that are scalar after vectorization. An
1732   /// instruction is scalar if it is known to be uniform or will be scalarized
1733   /// during vectorization. Non-uniform scalarized instructions will be
1734   /// represented by VF values in the vectorized loop, each corresponding to an
1735   /// iteration of the original scalar loop.
1736   void collectLoopScalars(ElementCount VF);
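
  // To illustrate the uniform/scalar distinction (illustrative C source):
  //   for (int i = 0; i < n; ++i)
  //     a[i] = b[i] / c;
  // the address computations &a[i] and &b[i] feeding consecutive accesses are
  // uniform - a single scalar copy per vector iteration suffices - whereas a
  // division that had to be scalarized (e.g. under predication) would need VF
  // scalar copies, one per original iteration covered by the vector iteration.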
1737 
1738   /// Keeps cost model vectorization decision and cost for instructions.
1739   /// Right now it is used for memory instructions only.
1740   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1741                                 std::pair<InstWidening, unsigned>>;
1742 
1743   DecisionList WideningDecisions;
1744 
1745   /// Returns true if \p V is expected to be vectorized and it needs to be
1746   /// extracted.
1747   bool needsExtract(Value *V, ElementCount VF) const {
1748     Instruction *I = dyn_cast<Instruction>(V);
1749     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1750         TheLoop->isLoopInvariant(I))
1751       return false;
1752 
1753     // Assume we can vectorize V (and hence we need extraction) if the
1754     // scalars are not computed yet. This can happen, because it is called
1755     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1756     // the scalars are collected. That should be a safe assumption in most
1757     // cases, because we check if the operands have vectorizable types
1758     // beforehand in LoopVectorizationLegality.
1759     return Scalars.find(VF) == Scalars.end() ||
1760            !isScalarAfterVectorization(I, VF);
1761   };
1762 
1763   /// Returns a range containing only operands needing to be extracted.
1764   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1765                                                    ElementCount VF) {
1766     return SmallVector<Value *, 4>(make_filter_range(
1767         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1768   }
1769 
1770   /// Determines if we have the infrastructure to vectorize loop \p L and its
1771   /// epilogue, assuming the main loop is vectorized by \p VF.
1772   bool isCandidateForEpilogueVectorization(const Loop &L,
1773                                            const ElementCount VF) const;
1774 
1775   /// Returns true if epilogue vectorization is considered profitable, and
1776   /// false otherwise.
1777   /// \p VF is the vectorization factor chosen for the original loop.
1778   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1779 
1780 public:
1781   /// The loop that we evaluate.
1782   Loop *TheLoop;
1783 
1784   /// Predicated scalar evolution analysis.
1785   PredicatedScalarEvolution &PSE;
1786 
1787   /// Loop Info analysis.
1788   LoopInfo *LI;
1789 
1790   /// Vectorization legality.
1791   LoopVectorizationLegality *Legal;
1792 
1793   /// Vector target information.
1794   const TargetTransformInfo &TTI;
1795 
1796   /// Target Library Info.
1797   const TargetLibraryInfo *TLI;
1798 
1799   /// Demanded bits analysis.
1800   DemandedBits *DB;
1801 
1802   /// Assumption cache.
1803   AssumptionCache *AC;
1804 
1805   /// Interface to emit optimization remarks.
1806   OptimizationRemarkEmitter *ORE;
1807 
1808   const Function *TheFunction;
1809 
1810   /// Loop Vectorize Hint.
1811   const LoopVectorizeHints *Hints;
1812 
1813   /// The interleave access information contains groups of interleaved accesses
1814   /// with the same stride and close to each other.
1815   InterleavedAccessInfo &InterleaveInfo;
1816 
1817   /// Values to ignore in the cost model.
1818   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1819 
1820   /// Values to ignore in the cost model when VF > 1.
1821   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1822 
1823   /// Profitable vector factors.
1824   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1825 };
1826 
1827 } // end namespace llvm
1828 
1829 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1830 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
// vectorize' semantics. This pragma provides *auto-vectorization hints*
1837 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1838 // provides *explicit vectorization hints* (LV can bypass legal checks and
1839 // assume that vectorization is legal). However, both hints are implemented
1840 // using the same metadata (llvm.loop.vectorize, processed by
1841 // LoopVectorizeHints). This will be fixed in the future when the native IR
1842 // representation for pragma 'omp simd' is introduced.
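//
// For example (illustrative source only), an outer loop written as
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < M; ++i)
//     for (int j = 0; j < N; ++j)
//       A[i][j] += B[i][j];
// carries an explicit vector length and is accepted here, whereas the same
// loop without vectorize_width(#), or with an interleave count greater than
// one, is rejected.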
1843 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1844                                    OptimizationRemarkEmitter *ORE) {
1845   assert(!OuterLp->isInnermost() && "This is not an outer loop");
1846   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1847 
1848   // Only outer loops with an explicit vectorization hint are supported.
1849   // Unannotated outer loops are ignored.
1850   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1851     return false;
1852 
1853   Function *Fn = OuterLp->getHeader()->getParent();
1854   if (!Hints.allowVectorization(Fn, OuterLp,
1855                                 true /*VectorizeOnlyWhenForced*/)) {
1856     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1857     return false;
1858   }
1859 
1860   if (Hints.getInterleave() > 1) {
1861     // TODO: Interleave support is future work.
1862     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1863                          "outer loops.\n");
1864     Hints.emitRemarkWithHints();
1865     return false;
1866   }
1867 
1868   return true;
1869 }
1870 
1871 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1872                                   OptimizationRemarkEmitter *ORE,
1873                                   SmallVectorImpl<Loop *> &V) {
1874   // Collect inner loops and outer loops without irreducible control flow. For
1875   // now, only collect outer loops that have explicit vectorization hints. If we
1876   // are stress testing the VPlan H-CFG construction, we collect the outermost
1877   // loop of every loop nest.
1878   if (L.isInnermost() || VPlanBuildStressTest ||
1879       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1880     LoopBlocksRPO RPOT(&L);
1881     RPOT.perform(LI);
1882     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1883       V.push_back(&L);
1884       // TODO: Collect inner loops inside marked outer loops in case
1885       // vectorization fails for the outer loop. Do not invoke
1886       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1887       // already known to be reducible. We can use an inherited attribute for
1888       // that.
1889       return;
1890     }
1891   }
1892   for (Loop *InnerL : L)
1893     collectSupportedLoops(*InnerL, LI, ORE, V);
1894 }
1895 
1896 namespace {
1897 
1898 /// The LoopVectorize Pass.
1899 struct LoopVectorize : public FunctionPass {
1900   /// Pass identification, replacement for typeid
1901   static char ID;
1902 
1903   LoopVectorizePass Impl;
1904 
1905   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1906                          bool VectorizeOnlyWhenForced = false)
1907       : FunctionPass(ID),
1908         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1909     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1910   }
1911 
1912   bool runOnFunction(Function &F) override {
1913     if (skipFunction(F))
1914       return false;
1915 
1916     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1917     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1918     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1919     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1920     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1921     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1922     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1923     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1924     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1925     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1926     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1927     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1928     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1929 
1930     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1931         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1932 
1933     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1934                         GetLAA, *ORE, PSI).MadeAnyChange;
1935   }
1936 
1937   void getAnalysisUsage(AnalysisUsage &AU) const override {
1938     AU.addRequired<AssumptionCacheTracker>();
1939     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1940     AU.addRequired<DominatorTreeWrapperPass>();
1941     AU.addRequired<LoopInfoWrapperPass>();
1942     AU.addRequired<ScalarEvolutionWrapperPass>();
1943     AU.addRequired<TargetTransformInfoWrapperPass>();
1944     AU.addRequired<AAResultsWrapperPass>();
1945     AU.addRequired<LoopAccessLegacyAnalysis>();
1946     AU.addRequired<DemandedBitsWrapperPass>();
1947     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1948     AU.addRequired<InjectTLIMappingsLegacy>();
1949 
    // We currently do not preserve loop info / dominator analyses with outer
    // loop vectorization. Until this is addressed, mark these analyses as
    // preserved only for the non-VPlan-native path.
1953     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1954     if (!EnableVPlanNativePath) {
1955       AU.addPreserved<LoopInfoWrapperPass>();
1956       AU.addPreserved<DominatorTreeWrapperPass>();
1957     }
1958 
1959     AU.addPreserved<BasicAAWrapperPass>();
1960     AU.addPreserved<GlobalsAAWrapperPass>();
1961     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1962   }
1963 };
1964 
1965 } // end anonymous namespace
1966 
1967 //===----------------------------------------------------------------------===//
1968 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1969 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1970 //===----------------------------------------------------------------------===//
1971 
1972 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1973   // We need to place the broadcast of invariant variables outside the loop,
1974   // but only if it's proven safe to do so. Else, broadcast will be inside
1975   // vector loop body.
1976   Instruction *Instr = dyn_cast<Instruction>(V);
1977   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1978                      (!Instr ||
1979                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1980   // Place the code for broadcasting invariant variables in the new preheader.
1981   IRBuilder<>::InsertPointGuard Guard(Builder);
1982   if (SafeToHoist)
1983     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1984 
1985   // Broadcast the scalar into all locations in the vector.
1986   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1987 
1988   return Shuf;
1989 }
1990 
1991 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1992     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1993   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1994          "Expected either an induction phi-node or a truncate of it!");
1995   Value *Start = II.getStartValue();
1996 
1997   // Construct the initial value of the vector IV in the vector loop preheader
1998   auto CurrIP = Builder.saveIP();
1999   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2000   if (isa<TruncInst>(EntryVal)) {
2001     assert(Start->getType()->isIntegerTy() &&
2002            "Truncation requires an integer type");
2003     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2004     Step = Builder.CreateTrunc(Step, TruncType);
2005     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2006   }
2007   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2008   Value *SteppedStart =
2009       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2010 
2011   // We create vector phi nodes for both integer and floating-point induction
2012   // variables. Here, we determine the kind of arithmetic we will perform.
2013   Instruction::BinaryOps AddOp;
2014   Instruction::BinaryOps MulOp;
2015   if (Step->getType()->isIntegerTy()) {
2016     AddOp = Instruction::Add;
2017     MulOp = Instruction::Mul;
2018   } else {
2019     AddOp = II.getInductionOpcode();
2020     MulOp = Instruction::FMul;
2021   }
2022 
2023   // Multiply the vectorization factor by the step using integer or
2024   // floating-point arithmetic as appropriate.
2025   Value *ConstVF =
2026       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
2027   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
2028 
2029   // Create a vector splat to use in the induction update.
2030   //
2031   // FIXME: If the step is non-constant, we create the vector splat with
2032   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2033   //        handle a constant vector splat.
2034   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2035   Value *SplatVF = isa<Constant>(Mul)
2036                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2037                        : Builder.CreateVectorSplat(VF, Mul);
2038   Builder.restoreIP(CurrIP);
2039 
2040   // We may need to add the step a number of times, depending on the unroll
2041   // factor. The last of those goes into the PHI.
2042   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2043                                     &*LoopVectorBody->getFirstInsertionPt());
2044   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2045   Instruction *LastInduction = VecInd;
2046   for (unsigned Part = 0; Part < UF; ++Part) {
2047     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
2048 
2049     if (isa<TruncInst>(EntryVal))
2050       addMetadata(LastInduction, EntryVal);
2051     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
2052 
2053     LastInduction = cast<Instruction>(addFastMathFlag(
2054         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
2055     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2056   }
2057 
2058   // Move the last step to the end of the latch block. This ensures consistent
2059   // placement of all induction updates.
2060   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2061   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2062   auto *ICmp = cast<Instruction>(Br->getCondition());
2063   LastInduction->moveBefore(ICmp);
2064   LastInduction->setName("vec.ind.next");
2065 
2066   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2067   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2068 }
2069 
2070 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2071   return Cost->isScalarAfterVectorization(I, VF) ||
2072          Cost->isProfitableToScalarize(I, VF);
2073 }
2074 
2075 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2076   if (shouldScalarizeInstruction(IV))
2077     return true;
2078   auto isScalarInst = [&](User *U) -> bool {
2079     auto *I = cast<Instruction>(U);
2080     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2081   };
2082   return llvm::any_of(IV->users(), isScalarInst);
2083 }
2084 
2085 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2086     const InductionDescriptor &ID, const Instruction *EntryVal,
2087     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
2088   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2089          "Expected either an induction phi-node or a truncate of it!");
2090 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV, based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
2097   if (isa<TruncInst>(EntryVal))
2098     return;
2099 
2100   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2101   if (Casts.empty())
2102     return;
2103   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
  // induction update chain itself.
2106   Instruction *CastInst = *Casts.begin();
2107   if (Lane < UINT_MAX)
2108     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
2109   else
2110     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
2111 }
2112 
2113 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
2114   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2115          "Primary induction variable must have an integer type");
2116 
2117   auto II = Legal->getInductionVars().find(IV);
2118   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2119 
2120   auto ID = II->second;
2121   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2122 
2123   // The value from the original loop to which we are mapping the new induction
2124   // variable.
2125   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2126 
2127   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2128 
2129   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
2131   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2132     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2133            "Induction step should be loop invariant");
2134     if (PSE.getSE()->isSCEVable(IV->getType())) {
2135       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2136       return Exp.expandCodeFor(Step, Step->getType(),
2137                                LoopVectorPreHeader->getTerminator());
2138     }
2139     return cast<SCEVUnknown>(Step)->getValue();
2140   };
2141 
2142   // The scalar value to broadcast. This is derived from the canonical
2143   // induction variable. If a truncation type is given, truncate the canonical
2144   // induction variable and step. Otherwise, derive these values from the
2145   // induction descriptor.
2146   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2147     Value *ScalarIV = Induction;
2148     if (IV != OldInduction) {
2149       ScalarIV = IV->getType()->isIntegerTy()
2150                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2151                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2152                                           IV->getType());
2153       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2154       ScalarIV->setName("offset.idx");
2155     }
2156     if (Trunc) {
2157       auto *TruncType = cast<IntegerType>(Trunc->getType());
2158       assert(Step->getType()->isIntegerTy() &&
2159              "Truncation requires an integer step");
2160       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2161       Step = Builder.CreateTrunc(Step, TruncType);
2162     }
2163     return ScalarIV;
2164   };
2165 
2166   // Create the vector values from the scalar IV, in the absence of creating a
2167   // vector IV.
2168   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2169     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2170     for (unsigned Part = 0; Part < UF; ++Part) {
2171       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2172       Value *EntryPart =
2173           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2174                         ID.getInductionOpcode());
2175       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2176       if (Trunc)
2177         addMetadata(EntryPart, Trunc);
2178       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
2179     }
2180   };
2181 
2182   // Now do the actual transformations, and start with creating the step value.
2183   Value *Step = CreateStepValue(ID.getStep());
2184   if (VF.isZero() || VF.isScalar()) {
2185     Value *ScalarIV = CreateScalarIV(Step);
2186     CreateSplatIV(ScalarIV, Step);
2187     return;
2188   }
2189 
2190   // Determine if we want a scalar version of the induction variable. This is
2191   // true if the induction variable itself is not widened, or if it has at
2192   // least one user in the loop that is not widened.
2193   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2194   if (!NeedsScalarIV) {
2195     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2196     return;
2197   }
2198 
2199   // Try to create a new independent vector induction variable. If we can't
2200   // create the phi node, we will splat the scalar induction variable in each
2201   // loop iteration.
2202   if (!shouldScalarizeInstruction(EntryVal)) {
2203     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2204     Value *ScalarIV = CreateScalarIV(Step);
2205     // Create scalar steps that can be used by instructions we will later
2206     // scalarize. Note that the addition of the scalar steps will not increase
2207     // the number of instructions in the loop in the common case prior to
2208     // InstCombine. We will be trading one vector extract for each scalar step.
2209     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2210     return;
2211   }
2212 
2213   // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV. Except when we tail-fold, then the splat IV feeds the
2215   // predicate used by the masked loads/stores.
2216   Value *ScalarIV = CreateScalarIV(Step);
2217   if (!Cost->isScalarEpilogueAllowed())
2218     CreateSplatIV(ScalarIV, Step);
2219   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2220 }
2221 
2222 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2223                                           Instruction::BinaryOps BinOp) {
2224   // Create and check the types.
2225   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2226   int VLen = ValVTy->getNumElements();
2227 
2228   Type *STy = Val->getType()->getScalarType();
2229   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2230          "Induction Step must be an integer or FP");
2231   assert(Step->getType() == STy && "Step has wrong type");
2232 
2233   SmallVector<Constant *, 8> Indices;
2234 
2235   if (STy->isIntegerTy()) {
2236     // Create a vector of consecutive numbers from zero to VF.
2237     for (int i = 0; i < VLen; ++i)
2238       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2239 
2240     // Add the consecutive indices to the vector value.
2241     Constant *Cv = ConstantVector::get(Indices);
2242     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2243     Step = Builder.CreateVectorSplat(VLen, Step);
2244     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
2247     Step = Builder.CreateMul(Cv, Step);
2248     return Builder.CreateAdd(Val, Step, "induction");
2249   }
2250 
2251   // Floating point induction.
2252   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2253          "Binary Opcode should be specified for FP induction");
2254   // Create a vector of consecutive numbers from zero to VF.
2255   for (int i = 0; i < VLen; ++i)
2256     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2257 
2258   // Add the consecutive indices to the vector value.
2259   Constant *Cv = ConstantVector::get(Indices);
2260 
2261   Step = Builder.CreateVectorSplat(VLen, Step);
2262 
2263   // Floating point operations had to be 'fast' to enable the induction.
2264   FastMathFlags Flags;
2265   Flags.setFast();
2266 
2267   Value *MulOp = Builder.CreateFMul(Cv, Step);
2268   if (isa<Instruction>(MulOp))
    // Have to check: MulOp may be a constant rather than an instruction.
2270     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2271 
2272   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2273   if (isa<Instruction>(BOp))
2274     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2275   return BOp;
2276 }
2277 
2278 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2279                                            Instruction *EntryVal,
2280                                            const InductionDescriptor &ID) {
2281   // We shouldn't have to build scalar steps if we aren't vectorizing.
2282   assert(VF.isVector() && "VF should be greater than one");
2283   assert(!VF.isScalable() &&
2284          "the code below assumes a fixed number of elements at compile time");
2285   // Get the value type and ensure it and the step have the same integer type.
2286   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2287   assert(ScalarIVTy == Step->getType() &&
2288          "Val and Step should have the same type");
2289 
2290   // We build scalar steps for both integer and floating-point induction
2291   // variables. Here, we determine the kind of arithmetic we will perform.
2292   Instruction::BinaryOps AddOp;
2293   Instruction::BinaryOps MulOp;
2294   if (ScalarIVTy->isIntegerTy()) {
2295     AddOp = Instruction::Add;
2296     MulOp = Instruction::Mul;
2297   } else {
2298     AddOp = ID.getInductionOpcode();
2299     MulOp = Instruction::FMul;
2300   }
2301 
2302   // Determine the number of scalars we need to generate for each unroll
2303   // iteration. If EntryVal is uniform, we only need to generate the first
2304   // lane. Otherwise, we generate all VF values.
2305   unsigned Lanes =
2306       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2307           ? 1
2308           : VF.getKnownMinValue();
2309   // Compute the scalar steps and save the results in VectorLoopValueMap.
2310   for (unsigned Part = 0; Part < UF; ++Part) {
2311     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2312       auto *StartIdx = getSignedIntOrFpConstant(
2313           ScalarIVTy, VF.getKnownMinValue() * Part + Lane);
2314       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2315       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2316       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2317       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2318     }
2319   }
2320 }
2321 
2322 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2323   assert(V != Induction && "The new induction variable should not be used.");
2324   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2325   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2326 
2327   // If we have a stride that is replaced by one, do it here. Defer this for
2328   // the VPlan-native path until we start running Legal checks in that path.
2329   if (!EnableVPlanNativePath && Legal->hasStride(V))
2330     V = ConstantInt::get(V->getType(), 1);
2331 
2332   // If we have a vector mapped to this value, return it.
2333   if (VectorLoopValueMap.hasVectorValue(V, Part))
2334     return VectorLoopValueMap.getVectorValue(V, Part);
2335 
2336   // If the value has not been vectorized, check if it has been scalarized
2337   // instead. If it has been scalarized, and we actually need the value in
2338   // vector form, we will construct the vector values on demand.
2339   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2340     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2341 
2342     // If we've scalarized a value, that value should be an instruction.
2343     auto *I = cast<Instruction>(V);
2344 
2345     // If we aren't vectorizing, we can just copy the scalar map values over to
2346     // the vector map.
2347     if (VF.isScalar()) {
2348       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2349       return ScalarValue;
2350     }
2351 
2352     // Get the last scalar instruction we generated for V and Part. If the value
2353     // is known to be uniform after vectorization, this corresponds to lane zero
2354     // of the Part unroll iteration. Otherwise, the last instruction is the one
2355     // we created for the last vector lane of the Part unroll iteration.
2356     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2357     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2358                             ? 0
2359                             : VF.getKnownMinValue() - 1;
2360     auto *LastInst = cast<Instruction>(
2361         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2362 
2363     // Set the insert point after the last scalarized instruction. This ensures
2364     // the insertelement sequence will directly follow the scalar definitions.
2365     auto OldIP = Builder.saveIP();
2366     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2367     Builder.SetInsertPoint(&*NewIP);
2368 
2369     // However, if we are vectorizing, we need to construct the vector values.
2370     // If the value is known to be uniform after vectorization, we can just
2371     // broadcast the scalar value corresponding to lane zero for each unroll
2372     // iteration. Otherwise, we construct the vector values using insertelement
2373     // instructions. Since the resulting vectors are stored in
2374     // VectorLoopValueMap, we will only generate the insertelements once.
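         // As an illustrative sketch (value names are hypothetical), for VF = 4
         // and an i32 value, the non-uniform case below builds:
         //   %p0 = insertelement <4 x i32> undef, i32 %s0, i32 0
         //   %p1 = insertelement <4 x i32> %p0, i32 %s1, i32 1
         //   %p2 = insertelement <4 x i32> %p1, i32 %s2, i32 2
         //   %p3 = insertelement <4 x i32> %p2, i32 %s3, i32 3
         // where %s0..%s3 are the previously scalarized values for this part.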
2375     Value *VectorValue = nullptr;
2376     if (Cost->isUniformAfterVectorization(I, VF)) {
2377       VectorValue = getBroadcastInstrs(ScalarValue);
2378       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2379     } else {
2380       // Initialize packing with insertelements to start from undef.
2381       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2382       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2383       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2384       for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2385         packScalarIntoVectorValue(V, {Part, Lane});
2386       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2387     }
2388     Builder.restoreIP(OldIP);
2389     return VectorValue;
2390   }
2391 
2392   // If this scalar is unknown, assume that it is a constant or that it is
2393   // loop invariant. Broadcast V and save the value for future uses.
2394   Value *B = getBroadcastInstrs(V);
2395   VectorLoopValueMap.setVectorValue(V, Part, B);
2396   return B;
2397 }
2398 
2399 Value *
2400 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2401                                             const VPIteration &Instance) {
2402   // If the value is not an instruction contained in the loop, it should
2403   // already be scalar.
2404   if (OrigLoop->isLoopInvariant(V))
2405     return V;
2406 
2407   assert((Instance.Lane == 0 ||
2408           !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
2409          "Uniform values only have lane zero");
2410 
2411   // If the value from the original loop has not been vectorized, it is
2412   // represented by UF x VF scalar values in the new loop. Return the requested
2413   // scalar value.
2414   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2415     return VectorLoopValueMap.getScalarValue(V, Instance);
2416 
2417   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2418   // for the given unroll part. If this entry is not a vector type (i.e., the
2419   // vectorization factor is one), there is no need to generate an
2420   // extractelement instruction.
2421   auto *U = getOrCreateVectorValue(V, Instance.Part);
2422   if (!U->getType()->isVectorTy()) {
2423     assert(VF.isScalar() && "Value not scalarized has non-vector type");
2424     return U;
2425   }
2426 
2427   // Otherwise, the value from the original loop has been vectorized and is
2428   // represented by UF vector values. Extract and return the requested scalar
2429   // value from the appropriate vector lane.
2430   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2431 }
2432 
2433 void InnerLoopVectorizer::packScalarIntoVectorValue(
2434     Value *V, const VPIteration &Instance) {
2435   assert(V != Induction && "The new induction variable should not be used.");
2436   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2437   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2438 
2439   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2440   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2441   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2442                                             Builder.getInt32(Instance.Lane));
2443   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2444 }
2445 
2446 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2447   assert(Vec->getType()->isVectorTy() && "Invalid type");
2448   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
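       // E.g., for VF = 4 the mask built below is <3, 2, 1, 0>, so the emitted
       // shuffle is, in illustrative IR:
       //   %reverse = shufflevector <4 x i32> %Vec, <4 x i32> undef,
       //                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>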
2449   SmallVector<int, 8> ShuffleMask;
2450   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2451     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2452 
2453   return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2454 }
2455 
2456 // Return whether we allow using masked interleave-groups (for dealing with
2457 // strided loads/stores that reside in predicated blocks, or for dealing
2458 // with gaps).
2459 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2460   // If an override option has been passed in for interleaved accesses, use it.
2461   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2462     return EnableMaskedInterleavedMemAccesses;
2463 
2464   return TTI.enableMaskedInterleavedAccessVectorization();
2465 }
2466 
2467 // Try to vectorize the interleave group that \p Instr belongs to.
2468 //
2469 // E.g. Translate following interleaved load group (factor = 3):
2470 //   for (i = 0; i < N; i+=3) {
2471 //     R = Pic[i];             // Member of index 0
2472 //     G = Pic[i+1];           // Member of index 1
2473 //     B = Pic[i+2];           // Member of index 2
2474 //     ... // do something to R, G, B
2475 //   }
2476 // To:
2477 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2478 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2479 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2480 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2481 //
2482 // Or translate following interleaved store group (factor = 3):
2483 //   for (i = 0; i < N; i+=3) {
2484 //     ... do something to R, G, B
2485 //     Pic[i]   = R;           // Member of index 0
2486 //     Pic[i+1] = G;           // Member of index 1
2487 //     Pic[i+2] = B;           // Member of index 2
2488 //   }
2489 // To:
2490 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2491 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2492 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2493 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2494 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2495 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2496     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2497     VPValue *Addr, ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask) {
2498   Instruction *Instr = Group->getInsertPos();
2499   const DataLayout &DL = Instr->getModule()->getDataLayout();
2500 
2501   // Prepare for the vector type of the interleaved load/store.
2502   Type *ScalarTy = getMemInstValueType(Instr);
2503   unsigned InterleaveFactor = Group->getFactor();
2504   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2505   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2506 
2507   // Prepare for the new pointers.
2508   SmallVector<Value *, 2> AddrParts;
2509   unsigned Index = Group->getIndex(Instr);
2510 
2511   // TODO: extend the masked interleaved-group support to reversed access.
2512   assert((!BlockInMask || !Group->isReverse()) &&
2513          "Reversed masked interleave-group not supported.");
2514 
2515   // If the group is reverse, adjust the index to refer to the last vector lane
2516   // instead of the first. We adjust the index from the first vector lane,
2517   // rather than directly getting the pointer for lane VF - 1, because the
2518   // pointer operand of the interleaved access is supposed to be uniform. For
2519   // uniform instructions, we're only required to generate a value for the
2520   // first vector lane in each unroll iteration.
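       // For illustration: with VF = 4 and an interleave factor of 3, a reversed
       // group advances Index by (4 - 1) * 3 = 9 elements, so the negated index
       // used in the GEP below addresses the lowest element covered by the wide
       // access.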
2521   assert(!VF.isScalable() &&
2522          "scalable vector reverse operation is not implemented");
2523   if (Group->isReverse())
2524     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2525 
2526   for (unsigned Part = 0; Part < UF; Part++) {
2527     Value *AddrPart = State.get(Addr, {Part, 0});
2528     setDebugLocFromInst(Builder, AddrPart);
2529 
2530     // Note that the current instruction could be at any index in the group.
2531     // We need to adjust the address to the member of index 0.
2532     //
2533     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2534     //       b = A[i];       // Member of index 0
2535     // Current pointer is pointed to A[i+1], adjust it to A[i].
2536     //
2537     // E.g.  A[i+1] = a;     // Member of index 1
2538     //       A[i]   = b;     // Member of index 0
2539     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2540     // Current pointer is pointed to A[i+2], adjust it to A[i].
2541 
2542     bool InBounds = false;
2543     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2544       InBounds = gep->isInBounds();
2545     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2546     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2547 
2548     // Cast to the vector pointer type.
2549     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2550     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2551     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2552   }
2553 
2554   setDebugLocFromInst(Builder, Instr);
2555   Value *UndefVec = UndefValue::get(VecTy);
2556 
2557   Value *MaskForGaps = nullptr;
2558   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2559     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2560     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2561     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2562   }
2563 
2564   // Vectorize the interleaved load group.
2565   if (isa<LoadInst>(Instr)) {
2566     // For each unroll part, create a wide load for the group.
2567     SmallVector<Value *, 2> NewLoads;
2568     for (unsigned Part = 0; Part < UF; Part++) {
2569       Instruction *NewLoad;
2570       if (BlockInMask || MaskForGaps) {
2571         assert(useMaskedInterleavedAccesses(*TTI) &&
2572                "masked interleaved groups are not allowed.");
2573         Value *GroupMask = MaskForGaps;
2574         if (BlockInMask) {
2575           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2576           assert(!VF.isScalable() && "scalable vectors not yet supported.");
2577           Value *ShuffledMask = Builder.CreateShuffleVector(
2578               BlockInMaskPart,
2579               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2580               "interleaved.mask");
2581           GroupMask = MaskForGaps
2582                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2583                                                 MaskForGaps)
2584                           : ShuffledMask;
2585         }
2586         NewLoad =
2587             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2588                                      GroupMask, UndefVec, "wide.masked.vec");
2589       } else
2591         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2592                                             Group->getAlign(), "wide.vec");
2593       Group->addMetadata(NewLoad);
2594       NewLoads.push_back(NewLoad);
2595     }
2596 
2597     // For each member in the group, shuffle out the appropriate data from the
2598     // wide loads.
2599     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2600       Instruction *Member = Group->getMember(I);
2601 
2602       // Skip the gaps in the group.
2603       if (!Member)
2604         continue;
2605 
2606       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2607       auto StrideMask =
2608           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2609       for (unsigned Part = 0; Part < UF; Part++) {
2610         Value *StridedVec = Builder.CreateShuffleVector(
2611             NewLoads[Part], StrideMask, "strided.vec");
2612 
2613         // If this member has a different type, cast the result to that type.
2614         if (Member->getType() != ScalarTy) {
2615           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2616           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2617           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2618         }
2619 
2620         if (Group->isReverse())
2621           StridedVec = reverseVector(StridedVec);
2622 
2623         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2624       }
2625     }
2626     return;
2627   }
2628 
2629   // The sub-vector type for the current instruction.
2630   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2631   auto *SubVT = VectorType::get(ScalarTy, VF);
2632 
2633   // Vectorize the interleaved store group.
2634   for (unsigned Part = 0; Part < UF; Part++) {
2635     // Collect the stored vector from each member.
2636     SmallVector<Value *, 4> StoredVecs;
2637     for (unsigned i = 0; i < InterleaveFactor; i++) {
2638       // Interleaved store groups don't allow gaps, so each index has a member.
2639       assert(Group->getMember(i) &&
                  "Failed to get a member from an interleaved store group");
2640 
2641       Value *StoredVec = State.get(StoredValues[i], Part);
2642 
2643       if (Group->isReverse())
2644         StoredVec = reverseVector(StoredVec);
2645 
2646       // If this member has a different type, cast it to the unified type.
2648       if (StoredVec->getType() != SubVT)
2649         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2650 
2651       StoredVecs.push_back(StoredVec);
2652     }
2653 
2654     // Concatenate all vectors into a wide vector.
2655     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2656 
2657     // Interleave the elements in the wide vector.
2658     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2659     Value *IVec = Builder.CreateShuffleVector(
2660         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2661         "interleaved.vec");
2662 
2663     Instruction *NewStoreInstr;
2664     if (BlockInMask) {
2665       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2666       Value *ShuffledMask = Builder.CreateShuffleVector(
2667           BlockInMaskPart,
2668           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2669           "interleaved.mask");
2670       NewStoreInstr = Builder.CreateMaskedStore(
2671           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2672     } else
2674       NewStoreInstr =
2675           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2676 
2677     Group->addMetadata(NewStoreInstr);
2678   }
2679 }
2680 
2681 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2682     Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2683     VPValue *StoredValue, VPValue *BlockInMask) {
2684   // Attempt to issue a wide load.
2685   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2686   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2687 
2688   assert((LI || SI) && "Invalid Load/Store instruction");
2689   assert((!SI || StoredValue) && "No stored value provided for widened store");
2690   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2691 
2692   LoopVectorizationCostModel::InstWidening Decision =
2693       Cost->getWideningDecision(Instr, VF);
2694   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2695           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2696           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2697          "CM decision is not to widen the memory instruction");
2698 
2699   Type *ScalarDataTy = getMemInstValueType(Instr);
2700 
2701   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2702   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2703   const Align Alignment = getLoadStoreAlignment(Instr);
2704 
2705   // Determine if the pointer operand of the access is either consecutive or
2706   // reverse consecutive.
2707   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2708   bool ConsecutiveStride =
2709       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2710   bool CreateGatherScatter =
2711       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2712 
2713   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2714   // gather/scatter. Otherwise Decision should have been to Scalarize.
2715   assert((ConsecutiveStride || CreateGatherScatter) &&
2716          "The instruction should be scalarized");
2717   (void)ConsecutiveStride;
2718 
2719   VectorParts BlockInMaskParts(UF);
2720   bool isMaskRequired = BlockInMask;
2721   if (isMaskRequired)
2722     for (unsigned Part = 0; Part < UF; ++Part)
2723       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2724 
2725   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2726     // Calculate the pointer for the specific unroll-part.
2727     GetElementPtrInst *PartPtr = nullptr;
2728 
2729     bool InBounds = false;
2730     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2731       InBounds = gep->isInBounds();
2732 
2733     if (Reverse) {
2734       // If the address is consecutive but reversed, then the
2735       // wide store needs to start at the last vector element.
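           // Illustrative arithmetic: with VF = 4, part 0 ends up at Ptr - 3 and
           // part 1 at Ptr - 7, so each wide access covers the VF elements that
           // end at the original per-part address.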
2736       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2737           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2738       PartPtr->setIsInBounds(InBounds);
2739       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2740           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2741       PartPtr->setIsInBounds(InBounds);
2742       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2743         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2744     } else {
2745       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2746           ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue())));
2747       PartPtr->setIsInBounds(InBounds);
2748     }
2749 
2750     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2751     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2752   };
2753 
2754   // Handle Stores:
2755   if (SI) {
2756     setDebugLocFromInst(Builder, SI);
2757 
2758     for (unsigned Part = 0; Part < UF; ++Part) {
2759       Instruction *NewSI = nullptr;
2760       Value *StoredVal = State.get(StoredValue, Part);
2761       if (CreateGatherScatter) {
2762         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2763         Value *VectorGep = State.get(Addr, Part);
2764         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2765                                             MaskPart);
2766       } else {
2767         if (Reverse) {
2768           // If we store to reverse consecutive memory locations, then we need
2769           // to reverse the order of elements in the stored value.
2770           StoredVal = reverseVector(StoredVal);
2771           // We don't want to update the value in the map as it might be used in
2772           // another expression. So don't call resetVectorValue(StoredVal).
2773         }
2774         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2775         if (isMaskRequired)
2776           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2777                                             BlockInMaskParts[Part]);
2778         else
2779           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2780       }
2781       addMetadata(NewSI, SI);
2782     }
2783     return;
2784   }
2785 
2786   // Handle loads.
2787   assert(LI && "Must have a load instruction");
2788   setDebugLocFromInst(Builder, LI);
2789   for (unsigned Part = 0; Part < UF; ++Part) {
2790     Value *NewLI;
2791     if (CreateGatherScatter) {
2792       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2793       Value *VectorGep = State.get(Addr, Part);
2794       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2795                                          nullptr, "wide.masked.gather");
2796       addMetadata(NewLI, LI);
2797     } else {
2798       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2799       if (isMaskRequired)
2800         NewLI = Builder.CreateMaskedLoad(
2801             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2802             "wide.masked.load");
2803       else
2804         NewLI =
2805             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2806 
2807       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2808       addMetadata(NewLI, LI);
2809       if (Reverse)
2810         NewLI = reverseVector(NewLI);
2811     }
2812 
2813     State.set(Def, Instr, NewLI, Part);
2814   }
2815 }
2816 
2817 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2818                                                const VPIteration &Instance,
2819                                                bool IfPredicateInstr,
2820                                                VPTransformState &State) {
2821   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2822 
2823   setDebugLocFromInst(Builder, Instr);
2824 
2825   // Does this instruction return a value?
2826   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2827 
2828   Instruction *Cloned = Instr->clone();
2829   if (!IsVoidRetTy)
2830     Cloned->setName(Instr->getName() + ".cloned");
2831 
2832   // Replace the operands of the cloned instruction with their scalar
2833   // equivalents in the new loop.
2834   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2835     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2836     auto InputInstance = Instance;
2837     if (!Operand || !OrigLoop->contains(Operand) ||
2838         (Cost->isUniformAfterVectorization(Operand, State.VF)))
2839       InputInstance.Lane = 0;
2840     auto *NewOp = State.get(User.getOperand(op), InputInstance);
2841     Cloned->setOperand(op, NewOp);
2842   }
2843   addNewMetadata(Cloned, Instr);
2844 
2845   // Place the cloned scalar in the new loop.
2846   Builder.Insert(Cloned);
2847 
2848   // TODO: Set the result for the VPValue of VPReplicateRecipe. This requires
2849   // representing scalar values in VPTransformState. For now, add the cloned
2850   // scalar to the scalar map entry.
2851   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2852 
2853   // If we just cloned a new assumption, add it to the assumption cache.
2854   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2855     if (II->getIntrinsicID() == Intrinsic::assume)
2856       AC->registerAssumption(II);
2857 
2858   // End if-block.
2859   if (IfPredicateInstr)
2860     PredicatedInstructions.push_back(Cloned);
2861 }
2862 
2863 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2864                                                       Value *End, Value *Step,
2865                                                       Instruction *DL) {
2866   BasicBlock *Header = L->getHeader();
2867   BasicBlock *Latch = L->getLoopLatch();
2868   // As we're just creating this loop, it's possible no latch exists
2869   // yet. If so, use the header as this will be a single block loop.
2870   if (!Latch)
2871     Latch = Header;
2872 
2873   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2874   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2875   setDebugLocFromInst(Builder, OldInst);
2876   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2877 
2878   Builder.SetInsertPoint(Latch->getTerminator());
2879   setDebugLocFromInst(Builder, OldInst);
2880 
2881   // Create i+1 and fill the PHINode.
2882   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2883   Induction->addIncoming(Start, L->getLoopPreheader());
2884   Induction->addIncoming(Next, Latch);
2885   // Create the compare.
2886   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2887   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2888 
2889   // Now we have two terminators. Remove the old one from the block.
2890   Latch->getTerminator()->eraseFromParent();
2891 
2892   return Induction;
2893 }
2894 
2895 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2896   if (TripCount)
2897     return TripCount;
2898 
2899   assert(L && "Create Trip Count for null loop.");
2900   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2901   // Find the loop boundaries.
2902   ScalarEvolution *SE = PSE.getSE();
2903   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2904   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2905          "Invalid loop count");
2906 
2907   Type *IdxTy = Legal->getWidestInductionType();
2908   assert(IdxTy && "No type for induction");
2909 
2910   // The exit count might have type i64 while the phi is i32. This can happen
2911   // if we have an induction variable that is sign extended before the
2912   // compare. The only way we can get a backedge-taken count in that case is
2913   // if the induction variable was signed and therefore will not overflow, so
2914   // truncation is legal.
2915   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2916       IdxTy->getPrimitiveSizeInBits())
2917     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2918   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2919 
2920   // Get the total trip count from the count by adding 1.
2921   const SCEV *ExitCount = SE->getAddExpr(
2922       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
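       // For illustration (assuming a canonical loop such as
       // 'for (i = 0; i < n; ++i)' with n > 0): the backedge-taken count is
       // n - 1, and the trip count expanded below is n.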
2923 
2924   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2925 
2926   // Expand the trip count and place the new instructions in the preheader.
2927   // Notice that the pre-header does not change, only the loop body.
2928   SCEVExpander Exp(*SE, DL, "induction");
2929 
2930   // Count holds the overall loop count (N).
2931   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2932                                 L->getLoopPreheader()->getTerminator());
2933 
2934   if (TripCount->getType()->isPointerTy())
2935     TripCount =
2936         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2937                                     L->getLoopPreheader()->getTerminator());
2938 
2939   return TripCount;
2940 }
2941 
2942 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2943   if (VectorTripCount)
2944     return VectorTripCount;
2945 
2946   Value *TC = getOrCreateTripCount(L);
2947   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2948 
2949   Type *Ty = TC->getType();
2950   // This is where we can make the step a runtime constant.
2951   assert(!VF.isScalable() && "scalable vectorization is not supported yet");
2952   Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
2953 
2954   // If the tail is to be folded by masking, round the number of iterations N
2955   // up to a multiple of Step instead of rounding down. This is done by first
2956   // adding Step-1 and then rounding down. Note that it's ok if this addition
2957   // overflows: the vector induction variable will eventually wrap to zero given
2958   // that it starts at zero and its Step is a power of two; the loop will then
2959   // exit, with the last early-exit vector comparison also producing all-true.
2960   if (Cost->foldTailByMasking()) {
2961     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2962            "VF*UF must be a power of 2 when folding tail by masking");
2963     TC = Builder.CreateAdd(
2964         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
2965   }
2966 
2967   // Now we need to generate the expression for the part of the loop that the
2968   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2969   // iterations are not required for correctness, or N - Step, otherwise. Step
2970   // is equal to the vectorization factor (number of SIMD elements) times the
2971   // unroll factor (number of SIMD instructions).
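       // For illustration: with VF = 4 and UF = 2 (Step = 8) and a trip count of
       // 13, n.mod.vf is 13 % 8 = 5 and n.vec is 13 - 5 = 8; with tail folding,
       // the count was first rounded up to 20 above, giving n.vec = 20 - 4 = 16,
       // which covers all 13 iterations with masked vector iterations.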
2972   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2973 
2974   // If there is a non-reversed interleaved group that may speculatively access
2975   // memory out-of-bounds, we need to ensure that there will be at least one
2976   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2977   // the trip count, we set the remainder to be equal to the step. If the step
2978   // does not evenly divide the trip count, no adjustment is necessary since
2979   // there will already be scalar iterations. Note that the minimum iterations
2980   // check ensures that N >= Step.
2981   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
2982     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2983     R = Builder.CreateSelect(IsZero, Step, R);
2984   }
2985 
2986   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2987 
2988   return VectorTripCount;
2989 }
2990 
2991 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2992                                                    const DataLayout &DL) {
2993   // Verify that V is a vector type with same number of elements as DstVTy.
2994   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2995   unsigned VF = DstFVTy->getNumElements();
2996   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2997   assert((VF == SrcVecTy->getNumElements()) &&
              "Vector dimensions do not match");
2998   Type *SrcElemTy = SrcVecTy->getElementType();
2999   Type *DstElemTy = DstFVTy->getElementType();
3000   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3001          "Vector elements must have same size");
3002 
3003   // Do a direct cast if element types are castable.
3004   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3005     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3006   }
3007   // V cannot be directly casted to desired vector type.
3008   // May happen when V is a floating point vector but DstVTy is a vector of
3009   // pointers or vice-versa. Handle this using a two-step bitcast using an
3010   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
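       // Illustrative example (assuming 32-bit pointers): casting <4 x float> to
       // <4 x i8*> is done as <4 x float> -> <4 x i32> -> <4 x i8*>, using the
       // integer type whose width matches the element size.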
3011   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3012          "Only one type should be a pointer type");
3013   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3014          "Only one type should be a floating point type");
3015   Type *IntTy =
3016       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3017   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3018   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3019   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3020 }
3021 
3022 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3023                                                          BasicBlock *Bypass) {
3024   Value *Count = getOrCreateTripCount(L);
3025   // Reuse existing vector loop preheader for TC checks.
3026   // Note that a new preheader block is generated for the vector loop.
3027   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3028   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3029 
3030   // Generate code to check if the loop's trip count is less than VF * UF, or
3031   // equal to it in case a scalar epilogue is required; this implies that the
3032   // vector trip count is zero. This check also covers the case where adding one
3033   // to the backedge-taken count overflowed leading to an incorrect trip count
3034   // of zero. In this case we will also jump to the scalar loop.
3035   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3036                                           : ICmpInst::ICMP_ULT;
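       // For illustration: with VF = 4 and UF = 2 the check built below is
       // 'icmp ult %count, 8' (or ule when a scalar epilogue is required), and
       // taking the branch to Bypass skips the vector loop entirely.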
3037 
3038   // If tail is to be folded, vector loop takes care of all iterations.
3039   Value *CheckMinIters = Builder.getFalse();
3040   if (!Cost->foldTailByMasking()) {
3041     assert(!VF.isScalable() && "scalable vectors not yet supported.");
3042     CheckMinIters = Builder.CreateICmp(
3043         P, Count,
3044         ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF),
3045         "min.iters.check");
3046   }
3047   // Create new preheader for vector loop.
3048   LoopVectorPreHeader =
3049       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3050                  "vector.ph");
3051 
3052   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3053                                DT->getNode(Bypass)->getIDom()) &&
3054          "TC check is expected to dominate Bypass");
3055 
3056   // Update dominator for Bypass & LoopExit.
3057   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3058   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3059 
3060   ReplaceInstWithInst(
3061       TCCheckBlock->getTerminator(),
3062       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3063   LoopBypassBlocks.push_back(TCCheckBlock);
3064 }
3065 
3066 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3067   // Reuse existing vector loop preheader for SCEV checks.
3068   // Note that a new preheader block is generated for the vector loop.
3069   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
3070 
3071   // Generate the code to check the SCEV assumptions that we made.
3072   // We want the new basic block to start at the first instruction in a
3073   // sequence of instructions that form a check.
3074   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
3075                    "scev.check");
3076   Value *SCEVCheck = Exp.expandCodeForPredicate(
3077       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
3078 
3079   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
3080     if (C->isZero())
3081       return;
3082 
3083   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3084            (OptForSizeBasedOnProfile &&
3085             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3086          "Cannot SCEV check stride or overflow when optimizing for size");
3087 
3088   SCEVCheckBlock->setName("vector.scevcheck");
3089   // Create new preheader for vector loop.
3090   LoopVectorPreHeader =
3091       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
3092                  nullptr, "vector.ph");
3093 
3094   // Update the dominator only if this is the first RT check.
3095   if (LoopBypassBlocks.empty()) {
3096     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3097     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3098   }
3099 
3100   ReplaceInstWithInst(
3101       SCEVCheckBlock->getTerminator(),
3102       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
3103   LoopBypassBlocks.push_back(SCEVCheckBlock);
3104   AddedSafetyChecks = true;
3105 }
3106 
3107 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3108   // VPlan-native path does not do any analysis for runtime checks currently.
3109   if (EnableVPlanNativePath)
3110     return;
3111 
3112   // Reuse existing vector loop preheader for runtime memory checks.
3113   // Note that a new preheader block is generated for the vector loop.
3114   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
3115 
3116   // Generate the code that checks at runtime whether the arrays overlap. We
3117   // put the checks into a separate block to make the more common case of few
3118   // elements faster.
3119   auto *LAI = Legal->getLAI();
3120   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
3121   if (!RtPtrChecking.Need)
3122     return;
3123 
3124   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3125     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3126            "Cannot emit memory checks when optimizing for size, unless forced "
3127            "to vectorize.");
3128     ORE->emit([&]() {
3129       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3130                                         L->getStartLoc(), L->getHeader())
3131              << "Code-size may be reduced by not forcing "
3132                 "vectorization, or by source-code modifications "
3133                 "eliminating the need for runtime checks "
3134                 "(e.g., adding 'restrict').";
3135     });
3136   }
3137 
3138   MemCheckBlock->setName("vector.memcheck");
3139   // Create new preheader for vector loop.
3140   LoopVectorPreHeader =
3141       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
3142                  "vector.ph");
3143 
3144   auto *CondBranch = cast<BranchInst>(
3145       Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
3146   ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
3147   // Update the dominator only if this is the first RT check.
3148   if (LoopBypassBlocks.empty()) {
3149     DT->changeImmediateDominator(Bypass, MemCheckBlock);
3150     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
3151   }
3152 
3153   LoopBypassBlocks.push_back(MemCheckBlock);
3154   AddedSafetyChecks = true;
3155 
3156   Instruction *FirstCheckInst;
3157   Instruction *MemRuntimeCheck;
3158   std::tie(FirstCheckInst, MemRuntimeCheck) =
3159       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
3160                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
3161   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
3162                             "claimed checks are required");
3163   CondBranch->setCondition(MemRuntimeCheck);
3164 
3165   // We currently don't use LoopVersioning for the actual loop cloning but we
3166   // still use it to add the noalias metadata.
3167   LVer = std::make_unique<LoopVersioning>(
3168       *Legal->getLAI(),
3169       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3170       DT, PSE.getSE());
3171   LVer->prepareNoAliasMetadata();
3172 }
3173 
3174 Value *InnerLoopVectorizer::emitTransformedIndex(
3175     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3176     const InductionDescriptor &ID) const {
3177 
3178   SCEVExpander Exp(*SE, DL, "induction");
3179   auto Step = ID.getStep();
3180   auto StartValue = ID.getStartValue();
3181   assert(Index->getType() == Step->getType() &&
3182          "Index type does not match StepValue type");
3183 
3184   // Note: the IR at this point is broken. We cannot use SE to create any new
3185   // SCEV and then expand it, hoping that SCEV's simplification will give us
3186   // more optimal code. Unfortunately, attempting to do so on invalid IR may
3187   // lead to various SCEV crashes. So all we can do is use the builder and
3188   // rely on InstCombine for future simplifications. Here we handle only some
3189   // trivial cases.
3190   auto CreateAdd = [&B](Value *X, Value *Y) {
3191     assert(X->getType() == Y->getType() && "Types don't match!");
3192     if (auto *CX = dyn_cast<ConstantInt>(X))
3193       if (CX->isZero())
3194         return Y;
3195     if (auto *CY = dyn_cast<ConstantInt>(Y))
3196       if (CY->isZero())
3197         return X;
3198     return B.CreateAdd(X, Y);
3199   };
3200 
3201   auto CreateMul = [&B](Value *X, Value *Y) {
3202     assert(X->getType() == Y->getType() && "Types don't match!");
3203     if (auto *CX = dyn_cast<ConstantInt>(X))
3204       if (CX->isOne())
3205         return Y;
3206     if (auto *CY = dyn_cast<ConstantInt>(Y))
3207       if (CY->isOne())
3208         return X;
3209     return B.CreateMul(X, Y);
3210   };
3211 
3212   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3213   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3214   // the DomTree is not kept up-to-date for additional blocks generated in the
3215   // vector loop. By using the header as insertion point, we guarantee that the
3216   // expanded instructions dominate all their uses.
3217   auto GetInsertPoint = [this, &B]() {
3218     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3219     if (InsertBB != LoopVectorBody &&
3220         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3221       return LoopVectorBody->getTerminator();
3222     return &*B.GetInsertPoint();
3223   };
3224   switch (ID.getKind()) {
3225   case InductionDescriptor::IK_IntInduction: {
3226     assert(Index->getType() == StartValue->getType() &&
3227            "Index type does not match StartValue type");
3228     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3229       return B.CreateSub(StartValue, Index);
3230     auto *Offset = CreateMul(
3231         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3232     return CreateAdd(StartValue, Offset);
3233   }
3234   case InductionDescriptor::IK_PtrInduction: {
3235     assert(isa<SCEVConstant>(Step) &&
3236            "Expected constant step for pointer induction");
3237     return B.CreateGEP(
3238         StartValue->getType()->getPointerElementType(), StartValue,
3239         CreateMul(Index,
3240                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3241   }
3242   case InductionDescriptor::IK_FpInduction: {
3243     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3244     auto InductionBinOp = ID.getInductionBinOp();
3245     assert(InductionBinOp &&
3246            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3247             InductionBinOp->getOpcode() == Instruction::FSub) &&
3248            "Original bin op should be defined for FP induction");
3249 
3250     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3251 
3252     // Floating point operations had to be 'fast' to enable the induction.
3253     FastMathFlags Flags;
3254     Flags.setFast();
3255 
3256     Value *MulExp = B.CreateFMul(StepValue, Index);
3257     if (isa<Instruction>(MulExp))
3258       // We have to check, the MulExp may be a constant.
3259       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3260 
3261     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3262                                "induction");
3263     if (isa<Instruction>(BOp))
3264       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3265 
3266     return BOp;
3267   }
3268   case InductionDescriptor::IK_NoInduction:
3269     return nullptr;
3270   }
3271   llvm_unreachable("invalid enum");
3272 }
3273 
3274 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3275   LoopScalarBody = OrigLoop->getHeader();
3276   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3277   LoopExitBlock = OrigLoop->getExitBlock();
3278   assert(LoopExitBlock && "Must have an exit block");
3279   assert(LoopVectorPreHeader && "Invalid loop structure");
3280 
3281   LoopMiddleBlock =
3282       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3283                  LI, nullptr, Twine(Prefix) + "middle.block");
3284   LoopScalarPreHeader =
3285       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3286                  nullptr, Twine(Prefix) + "scalar.ph");
3287   // We intentionally don't let SplitBlock update LoopInfo since
3288   // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3289   // LoopVectorBody is explicitly added to the correct place a few lines later.
3290   LoopVectorBody =
3291       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3292                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3293 
3294   // Update dominator for loop exit.
3295   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3296 
3297   // Create and register the new vector loop.
3298   Loop *Lp = LI->AllocateLoop();
3299   Loop *ParentLoop = OrigLoop->getParentLoop();
3300 
3301   // Insert the new loop into the loop nest and register the new basic blocks
3302   // before calling any utilities such as SCEV that require valid LoopInfo.
3303   if (ParentLoop) {
3304     ParentLoop->addChildLoop(Lp);
3305   } else {
3306     LI->addTopLevelLoop(Lp);
3307   }
3308   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3309   return Lp;
3310 }
3311 
3312 void InnerLoopVectorizer::createInductionResumeValues(
3313     Loop *L, Value *VectorTripCount,
3314     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3315   assert(VectorTripCount && L && "Expected valid arguments");
3316   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3317           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3318          "Inconsistent information about additional bypass.");
3319   // We are going to resume the execution of the scalar loop.
3320   // Go over all of the induction variables that we found and fix the
3321   // PHIs that are left in the scalar version of the loop.
3322   // The starting values of PHI nodes depend on the counter of the last
3323   // iteration in the vectorized loop.
3324   // If we come from a bypass edge then we need to start from the original
3325   // start value.
3326   for (auto &InductionEntry : Legal->getInductionVars()) {
3327     PHINode *OrigPhi = InductionEntry.first;
3328     InductionDescriptor II = InductionEntry.second;
3329 
3330     // Create phi nodes to merge from the backedge-taken check block.
3331     PHINode *BCResumeVal =
3332         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3333                         LoopScalarPreHeader->getTerminator());
3334     // Copy original phi DL over to the new one.
3335     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3336     Value *&EndValue = IVEndValues[OrigPhi];
3337     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3338     if (OrigPhi == OldInduction) {
3339       // We know what the end value is.
3340       EndValue = VectorTripCount;
3341     } else {
3342       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3343       Type *StepType = II.getStep()->getType();
3344       Instruction::CastOps CastOp =
3345           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3346       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3347       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3348       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3349       EndValue->setName("ind.end");
3350 
3351       // Compute the end value for the additional bypass (if applicable).
3352       if (AdditionalBypass.first) {
3353         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3354         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3355                                          StepType, true);
3356         CRD =
3357             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3358         EndValueFromAdditionalBypass =
3359             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3360         EndValueFromAdditionalBypass->setName("ind.end");
3361       }
3362     }
3363     // The new PHI merges the original incoming value, in case of a bypass,
3364     // or the value at the end of the vectorized loop.
3365     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3366 
3367     // Fix the scalar body counter (PHI node).
3368     // The old induction's phi node in the scalar body needs the truncated
3369     // value.
3370     for (BasicBlock *BB : LoopBypassBlocks)
3371       BCResumeVal->addIncoming(II.getStartValue(), BB);
3372 
3373     if (AdditionalBypass.first)
3374       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3375                                             EndValueFromAdditionalBypass);
3376 
3377     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3378   }
3379 }
3380 
3381 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3382                                                       MDNode *OrigLoopID) {
3383   assert(L && "Expected valid loop.");
3384 
3385   // The trip counts should be cached by now.
3386   Value *Count = getOrCreateTripCount(L);
3387   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3388 
3389   // We need the OrigLoop (scalar loop part) latch terminator to help
3390   // produce correct debug info for the middle block BB instructions.
3391   // The legality check stage guarantees that the loop will have a single
3392   // latch.
3393   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3394          "Scalar loop latch terminator isn't a branch");
3395   BranchInst *ScalarLatchBr =
3396       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3397 
3398   // Add a check in the middle block to see if we have completed
3399   // all of the iterations in the first vector loop.
3400   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3401   // If tail is to be folded, we know we don't need to run the remainder.
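       // For illustration: with N = 13 and VF * UF = 8, the vector trip count is
       // 13 - (13 % 8) = 8 != 13, so the scalar remainder runs; with N = 16 the
       // compare below is true and the remainder is skipped.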
3402   Value *CmpN = Builder.getTrue();
3403   if (!Cost->foldTailByMasking()) {
3404     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3405                            VectorTripCount, "cmp.n",
3406                            LoopMiddleBlock->getTerminator());
3407 
3408     // Here we use the same DebugLoc as the scalar loop latch branch instead
3409     // of the corresponding compare because they may have ended up with
3410     // different line numbers and we want to avoid awkward line stepping while
3411     // debugging. E.g., if the compare has a line number inside the loop.
3412     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3413   }
3414 
3415   BranchInst *BrInst =
3416       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3417   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3418   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3419 
3420   // Get ready to start creating new instructions into the vectorized body.
3421   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3422          "Inconsistent vector loop preheader");
3423   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3424 
3425   Optional<MDNode *> VectorizedLoopID =
3426       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3427                                       LLVMLoopVectorizeFollowupVectorized});
3428   if (VectorizedLoopID.hasValue()) {
3429     L->setLoopID(VectorizedLoopID.getValue());
3430 
3431     // Do not setAlreadyVectorized if loop attributes have been defined
3432     // explicitly.
3433     return LoopVectorPreHeader;
3434   }
3435 
3436   // Keep all loop hints from the original loop on the vector loop (we'll
3437   // replace the vectorizer-specific hints below).
3438   if (MDNode *LID = OrigLoop->getLoopID())
3439     L->setLoopID(LID);
3440 
3441   LoopVectorizeHints Hints(L, true, *ORE);
3442   Hints.setAlreadyVectorized();
3443 
3444 #ifdef EXPENSIVE_CHECKS
3445   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3446   LI->verify(*DT);
3447 #endif
3448 
3449   return LoopVectorPreHeader;
3450 }
3451 
3452 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3453   /*
3454    In this function we generate a new loop. The new loop will contain
3455    the vectorized instructions while the old loop will continue to run the
3456    scalar remainder.
3457 
3458        [ ] <-- loop iteration number check.
3459     /   |
3460    /    v
3461   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3462   |  /  |
3463   | /   v
3464   ||   [ ]     <-- vector pre header.
3465   |/    |
3466   |     v
3467   |    [  ] \
3468   |    [  ]_|   <-- vector loop.
3469   |     |
3470   |     v
3471   |   -[ ]   <--- middle-block.
3472   |  /  |
3473   | /   v
3474   -|- >[ ]     <--- new preheader.
3475    |    |
3476    |    v
3477    |   [ ] \
3478    |   [ ]_|   <-- old scalar loop to handle remainder.
3479     \   |
3480      \  v
3481       >[ ]     <-- exit block.
3482    ...
3483    */
3484 
3485   // Get the metadata of the original loop before it gets modified.
3486   MDNode *OrigLoopID = OrigLoop->getLoopID();
3487 
3488   // Create an empty vector loop, and prepare basic blocks for the runtime
3489   // checks.
3490   Loop *Lp = createVectorLoopSkeleton("");
3491 
3492   // Now, compare the new count to zero. If it is zero skip the vector loop and
3493   // jump to the scalar loop. This check also covers the case where the
3494   // backedge-taken count is uint##_max: adding one to it will overflow leading
3495   // to an incorrect trip count of zero. In this (rare) case we will also jump
3496   // to the scalar loop.
3497   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3498 
3499   // Generate the code to check any assumptions that we've made for SCEV
3500   // expressions.
3501   emitSCEVChecks(Lp, LoopScalarPreHeader);
3502 
3503   // Generate the code that checks at runtime whether the arrays overlap. We
3504   // put the checks into a separate block to make the more common case of few
3505   // elements faster.
3506   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3507 
3508   // Some loops have a single integer induction variable, while other loops
3509   // don't. One example is C++ iterators, which often have multiple pointer
3510   // induction variables. In the code below we also support a case where we
3511   // don't have a single induction variable.
3512   //
3513   // We try to obtain an induction variable from the original loop as hard
3514   // as possible. However if we don't find one that:
3515   //   - is an integer
3516   //   - counts from zero, stepping by one
3517   //   - is the size of the widest induction variable type
3518   // then we create a new one.
3519   OldInduction = Legal->getPrimaryInduction();
3520   Type *IdxTy = Legal->getWidestInductionType();
3521   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3522   // The loop step is equal to the vectorization factor (num of SIMD elements)
3523   // times the unroll factor (num of SIMD instructions).
3524   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3525   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
3526   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3527   Induction =
3528       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3529                               getDebugLocFromInstOrOperands(OldInduction));
3530 
3531   // Emit phis for the new starting index of the scalar loop.
3532   createInductionResumeValues(Lp, CountRoundDown);
3533 
3534   return completeLoopSkeleton(Lp, OrigLoopID);
3535 }
3536 
3537 // Fix up external users of the induction variable. At this point, we are
3538 // in LCSSA form, with all external PHIs that use the IV having one input value,
3539 // coming from the remainder loop. We need those PHIs to also have a correct
3540 // value for the IV when arriving directly from the middle block.
3541 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3542                                        const InductionDescriptor &II,
3543                                        Value *CountRoundDown, Value *EndValue,
3544                                        BasicBlock *MiddleBlock) {
3545   // There are two kinds of external IV usages - those that use the value
3546   // computed in the last iteration (the PHI) and those that use the penultimate
3547   // value (the value that feeds into the phi from the loop latch).
3548   // We allow both, but they, obviously, have different values.
3549 
3550   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3551 
3552   DenseMap<Value *, Value *> MissingVals;
3553 
3554   // An external user of the last iteration's value should see the value that
3555   // the remainder loop uses to initialize its own IV.
3556   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3557   for (User *U : PostInc->users()) {
3558     Instruction *UI = cast<Instruction>(U);
3559     if (!OrigLoop->contains(UI)) {
3560       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3561       MissingVals[UI] = EndValue;
3562     }
3563   }
3564 
  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent
  // SCEVs, that is Start + (Step * (CRD - 1)).
3568   for (User *U : OrigPhi->users()) {
3569     auto *UI = cast<Instruction>(U);
3570     if (!OrigLoop->contains(UI)) {
3571       const DataLayout &DL =
3572           OrigLoop->getHeader()->getModule()->getDataLayout();
3573       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3574 
3575       IRBuilder<> B(MiddleBlock->getTerminator());
3576       Value *CountMinusOne = B.CreateSub(
3577           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3578       Value *CMO =
3579           !II.getStep()->getType()->isIntegerTy()
3580               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3581                              II.getStep()->getType())
3582               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3583       CMO->setName("cast.cmo");
3584       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3585       Escape->setName("ind.escape");
3586       MissingVals[UI] = Escape;
3587     }
3588   }
3589 
3590   for (auto &I : MissingVals) {
3591     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3593     // that is %IV2 = phi [...], [ %IV1, %latch ]
3594     // In this case, if IV1 has an external use, we need to avoid adding both
3595     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3596     // don't already have an incoming value for the middle block.
3597     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3598       PHI->addIncoming(I.second, MiddleBlock);
3599   }
3600 }
3601 
3602 namespace {
3603 
3604 struct CSEDenseMapInfo {
3605   static bool canHandle(const Instruction *I) {
3606     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3607            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3608   }
3609 
3610   static inline Instruction *getEmptyKey() {
3611     return DenseMapInfo<Instruction *>::getEmptyKey();
3612   }
3613 
3614   static inline Instruction *getTombstoneKey() {
3615     return DenseMapInfo<Instruction *>::getTombstoneKey();
3616   }
3617 
3618   static unsigned getHashValue(const Instruction *I) {
3619     assert(canHandle(I) && "Unknown instruction!");
3620     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3621                                                            I->value_op_end()));
3622   }
3623 
3624   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3625     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3626         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3627       return LHS == RHS;
3628     return LHS->isIdenticalTo(RHS);
3629   }
3630 };
3631 
3632 } // end anonymous namespace
3633 
/// Perform CSE of induction variable instructions.
3635 static void cse(BasicBlock *BB) {
3636   // Perform simple cse.
3637   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3638   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3639     Instruction *In = &*I++;
3640 
3641     if (!CSEDenseMapInfo::canHandle(In))
3642       continue;
3643 
3644     // Check if we can replace this instruction with any of the
3645     // visited instructions.
3646     if (Instruction *V = CSEMap.lookup(In)) {
3647       In->replaceAllUsesWith(V);
3648       In->eraseFromParent();
3649       continue;
3650     }
3651 
3652     CSEMap[In] = In;
3653   }
3654 }
3655 
3656 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3657                                                        ElementCount VF,
3658                                                        bool &NeedToScalarize) {
3659   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3660   Function *F = CI->getCalledFunction();
3661   Type *ScalarRetTy = CI->getType();
3662   SmallVector<Type *, 4> Tys, ScalarTys;
3663   for (auto &ArgOp : CI->arg_operands())
3664     ScalarTys.push_back(ArgOp->getType());
3665 
  // Estimate the cost of a scalarized vector call. The source operands are
  // assumed to be vectors, so we need to extract individual elements from
  // them, execute VF scalar calls, and then gather the results into the
  // vector return value.
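  //
  // Roughly: Cost = ScalarCallCost * VF + scalarization (extract/insert)
  // overhead, as computed below.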
3670   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3671                                                  TTI::TCK_RecipThroughput);
3672   if (VF.isScalar())
3673     return ScalarCallCost;
3674 
3675   // Compute corresponding vector type for return value and arguments.
3676   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3677   for (Type *ScalarTy : ScalarTys)
3678     Tys.push_back(ToVectorTy(ScalarTy, VF));
3679 
  // Compute the cost of unpacking the argument values for the scalar calls
  // and of packing the return values into a vector.
3682   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3683 
3684   unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3685 
3686   // If we can't emit a vector call for this function, then the currently found
3687   // cost is the cost we need to return.
3688   NeedToScalarize = true;
3689   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3690   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3691 
3692   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3693     return Cost;
3694 
3695   // If the corresponding vector cost is cheaper, return its cost.
3696   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3697                                                  TTI::TCK_RecipThroughput);
3698   if (VectorCallCost < Cost) {
3699     NeedToScalarize = false;
3700     return VectorCallCost;
3701   }
3702   return Cost;
3703 }
3704 
3705 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3706                                                             ElementCount VF) {
3707   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3708   assert(ID && "Expected intrinsic call!");
3709 
3710   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3711   return TTI.getIntrinsicInstrCost(CostAttrs,
3712                                    TargetTransformInfo::TCK_RecipThroughput);
3713 }
3714 
3715 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3716   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3717   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3718   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3719 }
3720 
3721 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3722   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3723   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3724   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3725 }
3726 
3727 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I`, and re-extend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
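  //
  // For example (sketch), if MinBWs records that an i32 add only needs 8 bits:
  //   %a = add <4 x i32> %x, %y
  // becomes roughly:
  //   %x.tr = trunc <4 x i32> %x to <4 x i8>
  //   %y.tr = trunc <4 x i32> %y to <4 x i8>
  //   %a.tr = add <4 x i8> %x.tr, %y.tr
  //   %a    = zext <4 x i8> %a.tr to <4 x i32>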
3731   SmallPtrSet<Value *, 4> Erased;
3732   for (const auto &KV : Cost->getMinimalBitwidths()) {
3733     // If the value wasn't vectorized, we must maintain the original scalar
3734     // type. The absence of the value from VectorLoopValueMap indicates that it
3735     // wasn't vectorized.
3736     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3737       continue;
3738     for (unsigned Part = 0; Part < UF; ++Part) {
3739       Value *I = getOrCreateVectorValue(KV.first, Part);
3740       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3741         continue;
3742       Type *OriginalTy = I->getType();
3743       Type *ScalarTruncatedTy =
3744           IntegerType::get(OriginalTy->getContext(), KV.second);
3745       auto *TruncatedTy = FixedVectorType::get(
3746           ScalarTruncatedTy,
3747           cast<FixedVectorType>(OriginalTy)->getNumElements());
3748       if (TruncatedTy == OriginalTy)
3749         continue;
3750 
3751       IRBuilder<> B(cast<Instruction>(I));
3752       auto ShrinkOperand = [&](Value *V) -> Value * {
3753         if (auto *ZI = dyn_cast<ZExtInst>(V))
3754           if (ZI->getSrcTy() == TruncatedTy)
3755             return ZI->getOperand(0);
3756         return B.CreateZExtOrTrunc(V, TruncatedTy);
3757       };
3758 
3759       // The actual instruction modification depends on the instruction type,
3760       // unfortunately.
3761       Value *NewI = nullptr;
3762       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3763         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3764                              ShrinkOperand(BO->getOperand(1)));
3765 
3766         // Any wrapping introduced by shrinking this operation shouldn't be
3767         // considered undefined behavior. So, we can't unconditionally copy
3768         // arithmetic wrapping flags to NewI.
3769         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3770       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3771         NewI =
3772             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3773                          ShrinkOperand(CI->getOperand(1)));
3774       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3775         NewI = B.CreateSelect(SI->getCondition(),
3776                               ShrinkOperand(SI->getTrueValue()),
3777                               ShrinkOperand(SI->getFalseValue()));
3778       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3779         switch (CI->getOpcode()) {
3780         default:
3781           llvm_unreachable("Unhandled cast!");
3782         case Instruction::Trunc:
3783           NewI = ShrinkOperand(CI->getOperand(0));
3784           break;
3785         case Instruction::SExt:
3786           NewI = B.CreateSExtOrTrunc(
3787               CI->getOperand(0),
3788               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3789           break;
3790         case Instruction::ZExt:
3791           NewI = B.CreateZExtOrTrunc(
3792               CI->getOperand(0),
3793               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3794           break;
3795         }
3796       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3797         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3798                              ->getNumElements();
3799         auto *O0 = B.CreateZExtOrTrunc(
3800             SI->getOperand(0),
3801             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3802         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3803                              ->getNumElements();
3804         auto *O1 = B.CreateZExtOrTrunc(
3805             SI->getOperand(1),
3806             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3807 
3808         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3809       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3810         // Don't do anything with the operands, just extend the result.
3811         continue;
3812       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3813         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3814                             ->getNumElements();
3815         auto *O0 = B.CreateZExtOrTrunc(
3816             IE->getOperand(0),
3817             FixedVectorType::get(ScalarTruncatedTy, Elements));
3818         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3819         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3820       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3821         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3822                             ->getNumElements();
3823         auto *O0 = B.CreateZExtOrTrunc(
3824             EE->getOperand(0),
3825             FixedVectorType::get(ScalarTruncatedTy, Elements));
3826         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3827       } else {
3828         // If we don't know what to do, be conservative and don't do anything.
3829         continue;
3830       }
3831 
3832       // Lastly, extend the result.
3833       NewI->takeName(cast<Instruction>(I));
3834       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3835       I->replaceAllUsesWith(Res);
3836       cast<Instruction>(I)->eraseFromParent();
3837       Erased.insert(I);
3838       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3839     }
3840   }
3841 
  // We'll have created a number of ZExts that are now dead. Clean them up.
3843   for (const auto &KV : Cost->getMinimalBitwidths()) {
3844     // If the value wasn't vectorized, we must maintain the original scalar
3845     // type. The absence of the value from VectorLoopValueMap indicates that it
3846     // wasn't vectorized.
3847     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3848       continue;
3849     for (unsigned Part = 0; Part < UF; ++Part) {
3850       Value *I = getOrCreateVectorValue(KV.first, Part);
3851       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3852       if (Inst && Inst->use_empty()) {
3853         Value *NewI = Inst->getOperand(0);
3854         Inst->eraseFromParent();
3855         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3856       }
3857     }
3858   }
3859 }
3860 
3861 void InnerLoopVectorizer::fixVectorizedLoop() {
3862   // Insert truncates and extends for any truncated instructions as hints to
3863   // InstCombine.
3864   if (VF.isVector())
3865     truncateToMinimalBitwidths();
3866 
3867   // Fix widened non-induction PHIs by setting up the PHI operands.
3868   if (OrigPHIsToFix.size()) {
3869     assert(EnableVPlanNativePath &&
3870            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3871     fixNonInductionPHIs();
3872   }
3873 
3874   // At this point every instruction in the original loop is widened to a
3875   // vector form. Now we need to fix the recurrences in the loop. These PHI
3876   // nodes are currently empty because we did not want to introduce cycles.
3877   // This is the second stage of vectorizing recurrences.
3878   fixCrossIterationPHIs();
3879 
3880   // Forget the original basic block.
3881   PSE.getSE()->forgetLoop(OrigLoop);
3882 
3883   // Fix-up external users of the induction variables.
3884   for (auto &Entry : Legal->getInductionVars())
3885     fixupIVUsers(Entry.first, Entry.second,
3886                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3887                  IVEndValues[Entry.first], LoopMiddleBlock);
3888 
3889   fixLCSSAPHIs();
3890   for (Instruction *PI : PredicatedInstructions)
3891     sinkScalarOperands(&*PI);
3892 
3893   // Remove redundant induction instructions.
3894   cse(LoopVectorBody);
3895 
  // Set/update profile weights for the vector and remainder loops, as the
  // original loop iterations are now distributed among them. Note that the
  // original loop, represented by LoopScalarBody, becomes the remainder loop
  // after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() the
  // result may be slightly inaccurate, but that should be acceptable since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored; all the
  // weight is optimistically assigned to the vector loop.
3905   assert(!VF.isScalable() &&
3906          "cannot use scalable ElementCount to determine unroll factor");
3907   setProfileInfoAfterUnrolling(
3908       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3909       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3910 }
3911 
3912 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3913   // In order to support recurrences we need to be able to vectorize Phi nodes.
3914   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3915   // stage #2: We now need to fix the recurrences by adding incoming edges to
3916   // the currently empty PHI nodes. At this point every instruction in the
3917   // original loop is widened to a vector form so we can use them to construct
3918   // the incoming edges.
3919   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3920     // Handle first-order recurrences and reductions that need to be fixed.
3921     if (Legal->isFirstOrderRecurrence(&Phi))
3922       fixFirstOrderRecurrence(&Phi);
3923     else if (Legal->isReductionVariable(&Phi))
3924       fixReduction(&Phi);
3925   }
3926 }
3927 
3928 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3929   // This is the second phase of vectorizing first-order recurrences. An
3930   // overview of the transformation is described below. Suppose we have the
3931   // following loop.
3932   //
3933   //   for (int i = 0; i < n; ++i)
3934   //     b[i] = a[i] - a[i - 1];
3935   //
3936   // There is a first-order recurrence on "a". For this loop, the shorthand
3937   // scalar IR looks like:
3938   //
3939   //   scalar.ph:
3940   //     s_init = a[-1]
3941   //     br scalar.body
3942   //
3943   //   scalar.body:
3944   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3945   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3946   //     s2 = a[i]
3947   //     b[i] = s2 - s1
3948   //     br cond, scalar.body, ...
3949   //
  // In this example, s1 is a recurrence because its value depends on the
3951   // previous iteration. In the first phase of vectorization, we created a
3952   // temporary value for s1. We now complete the vectorization and produce the
3953   // shorthand vector IR shown below (for VF = 4, UF = 1).
3954   //
3955   //   vector.ph:
3956   //     v_init = vector(..., ..., ..., a[-1])
3957   //     br vector.body
3958   //
3959   //   vector.body
3960   //     i = phi [0, vector.ph], [i+4, vector.body]
3961   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3962   //     v2 = a[i, i+1, i+2, i+3];
3963   //     v3 = vector(v1(3), v2(0, 1, 2))
3964   //     b[i, i+1, i+2, i+3] = v2 - v3
3965   //     br cond, vector.body, middle.block
3966   //
3967   //   middle.block:
3968   //     x = v2(3)
3969   //     br scalar.ph
3970   //
3971   //   scalar.ph:
3972   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3973   //     br scalar.body
3974   //
  // After the vector loop finishes executing, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3977 
3978   // Get the original loop preheader and single loop latch.
3979   auto *Preheader = OrigLoop->getLoopPreheader();
3980   auto *Latch = OrigLoop->getLoopLatch();
3981 
3982   // Get the initial and previous values of the scalar recurrence.
3983   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3984   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3985 
3986   // Create a vector from the initial value.
3987   auto *VectorInit = ScalarInit;
3988   if (VF.isVector()) {
3989     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3990     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
3991     VectorInit = Builder.CreateInsertElement(
3992         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3993         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
3994   }
3995 
3996   // We constructed a temporary phi node in the first phase of vectorization.
3997   // This phi node will eventually be deleted.
3998   Builder.SetInsertPoint(
3999       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
4000 
4001   // Create a phi node for the new recurrence. The current value will either be
4002   // the initial value inserted into a vector or loop-varying vector value.
4003   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4004   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4005 
4006   // Get the vectorized previous value of the last part UF - 1. It appears last
4007   // among all unrolled iterations, due to the order of their construction.
4008   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
4009 
4010   // Find and set the insertion point after the previous value if it is an
4011   // instruction.
4012   BasicBlock::iterator InsertPt;
4013   // Note that the previous value may have been constant-folded so it is not
4014   // guaranteed to be an instruction in the vector loop.
4015   // FIXME: Loop invariant values do not form recurrences. We should deal with
4016   //        them earlier.
4017   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4018     InsertPt = LoopVectorBody->getFirstInsertionPt();
4019   else {
4020     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4021     if (isa<PHINode>(PreviousLastPart))
4022       // If the previous value is a phi node, we should insert after all the phi
4023       // nodes in the block containing the PHI to avoid breaking basic block
      // verification. Note that the basic block may be different from
4025       // LoopVectorBody, in case we predicate the loop.
4026       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4027     else
4028       InsertPt = ++PreviousInst->getIterator();
4029   }
4030   Builder.SetInsertPoint(&*InsertPt);
4031 
4032   // We will construct a vector for the recurrence by combining the values for
4033   // the current and previous iterations. This is the required shuffle mask.
4034   assert(!VF.isScalable());
4035   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
4036   ShuffleMask[0] = VF.getKnownMinValue() - 1;
4037   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
4038     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
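  // For example, with VF = 4 the mask is <3, 4, 5, 6>: the last lane of the
  // incoming vector followed by the first three lanes of the vector computed
  // for the current part.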
4039 
4040   // The vector from which to take the initial value for the current iteration
4041   // (actual or unrolled). Initially, this is the vector phi node.
4042   Value *Incoming = VecPhi;
4043 
4044   // Shuffle the current and previous vector and update the vector parts.
4045   for (unsigned Part = 0; Part < UF; ++Part) {
4046     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
4047     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
4048     auto *Shuffle =
4049         VF.isVector()
4050             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
4051             : Incoming;
4052     PhiPart->replaceAllUsesWith(Shuffle);
4053     cast<Instruction>(PhiPart)->eraseFromParent();
4054     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
4055     Incoming = PreviousPart;
4056   }
4057 
4058   // Fix the latch value of the new recurrence in the vector loop.
4059   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4060 
4061   // Extract the last vector element in the middle block. This will be the
4062   // initial value for the recurrence when jumping to the scalar loop.
4063   auto *ExtractForScalar = Incoming;
4064   if (VF.isVector()) {
4065     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4066     ExtractForScalar = Builder.CreateExtractElement(
4067         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
4068         "vector.recur.extract");
4069   }
4070   // Extract the second last element in the middle block if the
4071   // Phi is used outside the loop. We need to extract the phi itself
4072   // and not the last element (the phi update in the current iteration). This
4073   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4074   // when the scalar loop is not run at all.
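  // Continuing the example above (VF = 4, UF = 1): lane 3 of v2 is the last
  // s2, which seeds the scalar loop, while lane 2 of v2 is the value s1 takes
  // in the final vector iteration, which an out-of-loop user of the phi
  // should see.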
4075   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4076   if (VF.isVector())
4077     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4078         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4079         "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second-to-last element when VF > 1.
4084   else if (UF > 1)
4085     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
4086 
4087   // Fix the initial value of the original recurrence in the scalar loop.
4088   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4089   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4090   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4091     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4092     Start->addIncoming(Incoming, BB);
4093   }
4094 
4095   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4096   Phi->setName("scalar.recur");
4097 
4098   // Finally, fix users of the recurrence outside the loop. The users will need
4099   // either the last value of the scalar recurrence or the last value of the
4100   // vector recurrence we extracted in the middle block. Since the loop is in
4101   // LCSSA form, we just need to find all the phi nodes for the original scalar
4102   // recurrence in the exit block, and then add an edge for the middle block.
4103   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4104     if (LCSSAPhi.getIncomingValue(0) == Phi) {
4105       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4106     }
4107   }
4108 }
4109 
4110 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
4111   Constant *Zero = Builder.getInt32(0);
4112 
  // Get its reduction variable descriptor.
4114   assert(Legal->isReductionVariable(Phi) &&
4115          "Unable to find the reduction variable");
4116   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4117 
4118   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4119   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4120   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4121   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
4122     RdxDesc.getMinMaxRecurrenceKind();
4123   setDebugLocFromInst(Builder, ReductionStartValue);
4124   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4125 
4126   // We need to generate a reduction vector from the incoming scalar.
4127   // To do so, we need to generate the 'identity' vector and override
4128   // one of the elements with the incoming scalar reduction. We need
4129   // to do it in the vector-loop preheader.
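  //
  // For example (sketch), for an integer add reduction with scalar start
  // value %s and VF = 4, the vector start value is
  // <i32 %s, i32 0, i32 0, i32 0>, and plain zeroinitializer (the identity)
  // is used for the remaining unroll parts.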
4130   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4131 
4132   // This is the vector-clone of the value that leaves the loop.
4133   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
4134 
  // Find the reduction identity value: zero for addition, or, and xor;
  // one for multiplication; -1 for and.
4137   Value *Identity;
4138   Value *VectorStart;
4139   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
4140       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
4142     if (VF.isScalar() || IsInLoopReductionPhi) {
4143       VectorStart = Identity = ReductionStartValue;
4144     } else {
4145       VectorStart = Identity =
4146         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
4147     }
4148   } else {
4149     // Handle other reduction kinds:
4150     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
4151         RK, MinMaxKind, VecTy->getScalarType());
4152     if (VF.isScalar() || IsInLoopReductionPhi) {
4153       Identity = Iden;
      // In the scalar case (or for an in-loop reduction), the incoming scalar
      // reduction value is used directly as the start value.
4156       VectorStart = ReductionStartValue;
4157     } else {
4158       Identity = ConstantVector::getSplat(VF, Iden);
4159 
4160       // This vector is the Identity vector where the first element is the
4161       // incoming scalar reduction.
4162       VectorStart =
4163         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
4164     }
4165   }
4166 
4167   // Wrap flags are in general invalid after vectorization, clear them.
4168   clearReductionWrapFlags(RdxDesc);
4169 
4170   // Fix the vector-loop phi.
4171 
  // Reductions do not have to start at zero. They can start with
  // any loop-invariant value.
4174   BasicBlock *Latch = OrigLoop->getLoopLatch();
4175   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4176 
4177   for (unsigned Part = 0; Part < UF; ++Part) {
4178     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
4179     Value *Val = getOrCreateVectorValue(LoopVal, Part);
4180     // Make sure to add the reduction start value only to the
4181     // first unroll part.
4182     Value *StartVal = (Part == 0) ? VectorStart : Identity;
4183     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
4184     cast<PHINode>(VecRdxPhi)
4185       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4186   }
4187 
4188   // Before each round, move the insertion point right between
4189   // the PHIs and the values we are going to write.
4190   // This allows us to write both PHINodes and the extractelement
4191   // instructions.
4192   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4193 
4194   setDebugLocFromInst(Builder, LoopExitInst);
4195 
4196   // If tail is folded by masking, the vector value to leave the loop should be
4197   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4198   // instead of the former. For an inloop reduction the reduction will already
4199   // be predicated, and does not need to be handled here.
4200   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4201     for (unsigned Part = 0; Part < UF; ++Part) {
4202       Value *VecLoopExitInst =
4203           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4204       Value *Sel = nullptr;
4205       for (User *U : VecLoopExitInst->users()) {
4206         if (isa<SelectInst>(U)) {
4207           assert(!Sel && "Reduction exit feeding two selects");
4208           Sel = U;
4209         } else
4210           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4211       }
4212       assert(Sel && "Reduction exit feeds no select");
4213       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4214 
4215       // If the target can create a predicated operator for the reduction at no
4216       // extra cost in the loop (for example a predicated vadd), it can be
4217       // cheaper for the select to remain in the loop than be sunk out of it,
4218       // and so use the select value for the phi instead of the old
4219       // LoopExitValue.
4220       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4221       if (PreferPredicatedReductionSelect ||
4222           TTI->preferPredicatedReductionSelect(
4223               RdxDesc.getRecurrenceBinOp(), Phi->getType(),
4224               TargetTransformInfo::ReductionFlags())) {
4225         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4226         VecRdxPhi->setIncomingValueForBlock(
4227             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4228       }
4229     }
4230   }
4231 
4232   // If the vector reduction can be performed in a smaller type, we truncate
4233   // then extend the loop exit value to enable InstCombine to evaluate the
4234   // entire expression in the smaller type.
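  //
  // For example (sketch), an i32 add reduction whose values are known to fit
  // in i8 has its loop exit value rewritten roughly as:
  //   %rdx.tr = trunc <4 x i32> %rdx to <4 x i8>
  //   %rdx.ex = sext/zext <4 x i8> %rdx.tr to <4 x i32>
  // so InstCombine can later shrink the whole reduction chain.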
4235   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4236     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4237     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4238     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4239     Builder.SetInsertPoint(
4240         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4241     VectorParts RdxParts(UF);
4242     for (unsigned Part = 0; Part < UF; ++Part) {
4243       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4244       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4245       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4246                                         : Builder.CreateZExt(Trunc, VecTy);
4247       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4248            UI != RdxParts[Part]->user_end();)
4249         if (*UI != Trunc) {
4250           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4251           RdxParts[Part] = Extnd;
4252         } else {
4253           ++UI;
4254         }
4255     }
4256     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4257     for (unsigned Part = 0; Part < UF; ++Part) {
4258       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4259       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4260     }
4261   }
4262 
4263   // Reduce all of the unrolled parts into a single vector.
4264   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4265   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4266 
4267   // The middle block terminator has already been assigned a DebugLoc here (the
4268   // OrigLoop's single latch terminator). We want the whole middle block to
4269   // appear to execute on this line because: (a) it is all compiler generated,
4270   // (b) these instructions are always executed after evaluating the latch
4271   // conditional branch, and (c) other passes may add new predecessors which
4272   // terminate on this line. This is the easiest way to ensure we don't
4273   // accidentally cause an extra step back into the loop while debugging.
4274   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4275   for (unsigned Part = 1; Part < UF; ++Part) {
4276     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4277     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4278       // Floating point operations had to be 'fast' to enable the reduction.
4279       ReducedPartRdx = addFastMathFlag(
4280           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4281                               ReducedPartRdx, "bin.rdx"),
4282           RdxDesc.getFastMathFlags());
4283     else
4284       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4285                                       RdxPart);
4286   }
4287 
4288   // Create the reduction after the loop. Note that inloop reductions create the
4289   // target reduction in the loop using a Reduction recipe.
4290   if (VF.isVector() && !IsInLoopReductionPhi) {
4291     bool NoNaN = Legal->hasFunNoNaNAttr();
4292     ReducedPartRdx =
4293         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4294     // If the reduction can be performed in a smaller type, we need to extend
4295     // the reduction to the wider type before we branch to the original loop.
4296     if (Phi->getType() != RdxDesc.getRecurrenceType())
4297       ReducedPartRdx =
4298         RdxDesc.isSigned()
4299         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4300         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4301   }
4302 
4303   // Create a phi node that merges control-flow from the backedge-taken check
4304   // block and the middle block.
4305   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4306                                         LoopScalarPreHeader->getTerminator());
4307   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4308     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4309   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4310 
4311   // Now, we need to fix the users of the reduction variable
4312   // inside and outside of the scalar remainder loop.
4313   // We know that the loop is in LCSSA form. We need to update the
4314   // PHI nodes in the exit blocks.
4315   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4316     // All PHINodes need to have a single entry edge, or two if
4317     // we already fixed them.
4318     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4319 
4320     // We found a reduction value exit-PHI. Update it with the
4321     // incoming bypass edge.
4322     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4323       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4324   } // end of the LCSSA phi scan.
4325 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
4328   int IncomingEdgeBlockIdx =
4329     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4330   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4331   // Pick the other block.
4332   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4333   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4334   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4335 }
4336 
4337 void InnerLoopVectorizer::clearReductionWrapFlags(
4338     RecurrenceDescriptor &RdxDesc) {
4339   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4340   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4341       RK != RecurrenceDescriptor::RK_IntegerMult)
4342     return;
4343 
4344   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4345   assert(LoopExitInstr && "null loop exit instruction");
4346   SmallVector<Instruction *, 8> Worklist;
4347   SmallPtrSet<Instruction *, 8> Visited;
4348   Worklist.push_back(LoopExitInstr);
4349   Visited.insert(LoopExitInstr);
4350 
4351   while (!Worklist.empty()) {
4352     Instruction *Cur = Worklist.pop_back_val();
4353     if (isa<OverflowingBinaryOperator>(Cur))
4354       for (unsigned Part = 0; Part < UF; ++Part) {
4355         Value *V = getOrCreateVectorValue(Cur, Part);
4356         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4357       }
4358 
4359     for (User *U : Cur->users()) {
4360       Instruction *UI = cast<Instruction>(U);
4361       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4362           Visited.insert(UI).second)
4363         Worklist.push_back(UI);
4364     }
4365   }
4366 }
4367 
4368 void InnerLoopVectorizer::fixLCSSAPHIs() {
4369   assert(!VF.isScalable() && "the code below assumes fixed width vectors");
4370   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4371     if (LCSSAPhi.getNumIncomingValues() == 1) {
4372       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4373       // Non-instruction incoming values will have only one value.
4374       unsigned LastLane = 0;
4375       if (isa<Instruction>(IncomingValue))
4376         LastLane = Cost->isUniformAfterVectorization(
4377                        cast<Instruction>(IncomingValue), VF)
4378                        ? 0
4379                        : VF.getKnownMinValue() - 1;
4380       // Can be a loop invariant incoming value or the last scalar value to be
4381       // extracted from the vectorized loop.
4382       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4383       Value *lastIncomingValue =
4384           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4385       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4386     }
4387   }
4388 }
4389 
4390 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4391   // The basic block and loop containing the predicated instruction.
4392   auto *PredBB = PredInst->getParent();
4393   auto *VectorLoop = LI->getLoopFor(PredBB);
4394 
4395   // Initialize a worklist with the operands of the predicated instruction.
4396   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4397 
4398   // Holds instructions that we need to analyze again. An instruction may be
4399   // reanalyzed if we don't yet know if we can sink it or not.
4400   SmallVector<Instruction *, 8> InstsToReanalyze;
4401 
4402   // Returns true if a given use occurs in the predicated block. Phi nodes use
4403   // their operands in their corresponding predecessor blocks.
4404   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4405     auto *I = cast<Instruction>(U.getUser());
4406     BasicBlock *BB = I->getParent();
4407     if (auto *Phi = dyn_cast<PHINode>(I))
4408       BB = Phi->getIncomingBlock(
4409           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4410     return BB == PredBB;
4411   };
4412 
4413   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a full
  // pass over the worklist sinks no instructions.
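  //
  // For example, a scalarized address computation whose only user is a
  // predicated store ends up in the store's block, so it is not executed when
  // the predicate is false.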
4417   bool Changed;
4418   do {
4419     // Add the instructions that need to be reanalyzed to the worklist, and
4420     // reset the changed indicator.
4421     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4422     InstsToReanalyze.clear();
4423     Changed = false;
4424 
4425     while (!Worklist.empty()) {
4426       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4427 
4428       // We can't sink an instruction if it is a phi node, is already in the
4429       // predicated block, is not in the loop, or may have side effects.
4430       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4431           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4432         continue;
4433 
4434       // It's legal to sink the instruction if all its uses occur in the
4435       // predicated block. Otherwise, there's nothing to do yet, and we may
4436       // need to reanalyze the instruction.
4437       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4438         InstsToReanalyze.push_back(I);
4439         continue;
4440       }
4441 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4444       I->moveBefore(&*PredBB->getFirstInsertionPt());
4445       Worklist.insert(I->op_begin(), I->op_end());
4446 
4447       // The sinking may have enabled other instructions to be sunk, so we will
4448       // need to iterate.
4449       Changed = true;
4450     }
4451   } while (Changed);
4452 }
4453 
4454 void InnerLoopVectorizer::fixNonInductionPHIs() {
4455   for (PHINode *OrigPhi : OrigPHIsToFix) {
4456     PHINode *NewPhi =
4457         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4458     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4459 
4460     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4461         predecessors(OrigPhi->getParent()));
4462     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4463         predecessors(NewPhi->getParent()));
4464     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4465            "Scalar and Vector BB should have the same number of predecessors");
4466 
4467     // The insertion point in Builder may be invalidated by the time we get
4468     // here. Force the Builder insertion point to something valid so that we do
4469     // not run into issues during insertion point restore in
4470     // getOrCreateVectorValue calls below.
4471     Builder.SetInsertPoint(NewPhi);
4472 
4473     // The predecessor order is preserved and we can rely on mapping between
4474     // scalar and vector block predecessors.
4475     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4476       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4477 
4478       // When looking up the new scalar/vector values to fix up, use incoming
4479       // values from original phi.
4480       Value *ScIncV =
4481           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4482 
4483       // Scalar incoming value may need a broadcast
4484       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4485       NewPhi->addIncoming(NewIncV, NewPredBB);
4486     }
4487   }
4488 }
4489 
4490 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4491                                    VPUser &Operands, unsigned UF,
4492                                    ElementCount VF, bool IsPtrLoopInvariant,
4493                                    SmallBitVector &IsIndexLoopInvariant,
4494                                    VPTransformState &State) {
4495   // Construct a vector GEP by widening the operands of the scalar GEP as
4496   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4497   // results in a vector of pointers when at least one operand of the GEP
4498   // is vector-typed. Thus, to keep the representation compact, we only use
4499   // vector-typed operands for loop-varying values.
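  //
  // For example (sketch), for a GEP with a loop-invariant base and a
  // loop-varying index, only the index is widened:
  //   getelementptr i32, i32* %base, <4 x i64> %vec.ind
  // which yields a <4 x i32*> vector of addresses.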
4500 
4501   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4502     // If we are vectorizing, but the GEP has only loop-invariant operands,
4503     // the GEP we build (by only using vector-typed operands for
4504     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4505     // produce a vector of pointers, we need to either arbitrarily pick an
4506     // operand to broadcast, or broadcast a clone of the original GEP.
4507     // Here, we broadcast a clone of the original.
4508     //
4509     // TODO: If at some point we decide to scalarize instructions having
4510     //       loop-invariant operands, this special case will no longer be
4511     //       required. We would add the scalarization decision to
4512     //       collectLoopScalars() and teach getVectorValue() to broadcast
4513     //       the lane-zero scalar value.
4514     auto *Clone = Builder.Insert(GEP->clone());
4515     for (unsigned Part = 0; Part < UF; ++Part) {
4516       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4517       State.set(VPDef, GEP, EntryPart, Part);
4518       addMetadata(EntryPart, GEP);
4519     }
4520   } else {
4521     // If the GEP has at least one loop-varying operand, we are sure to
4522     // produce a vector of pointers. But if we are only unrolling, we want
4523     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4524     // produce with the code below will be scalar (if VF == 1) or vector
4525     // (otherwise). Note that for the unroll-only case, we still maintain
4526     // values in the vector mapping with initVector, as we do for other
4527     // instructions.
4528     for (unsigned Part = 0; Part < UF; ++Part) {
4529       // The pointer operand of the new GEP. If it's loop-invariant, we
4530       // won't broadcast it.
4531       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4532                                      : State.get(Operands.getOperand(0), Part);
4533 
4534       // Collect all the indices for the new GEP. If any index is
4535       // loop-invariant, we won't broadcast it.
4536       SmallVector<Value *, 4> Indices;
4537       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4538         VPValue *Operand = Operands.getOperand(I);
4539         if (IsIndexLoopInvariant[I - 1])
4540           Indices.push_back(State.get(Operand, {0, 0}));
4541         else
4542           Indices.push_back(State.get(Operand, Part));
4543       }
4544 
      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector otherwise.
4547       auto *NewGEP =
4548           GEP->isInBounds()
4549               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4550                                           Indices)
4551               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4552       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4553              "NewGEP is not a pointer vector");
4554       State.set(VPDef, GEP, NewGEP, Part);
4555       addMetadata(NewGEP, GEP);
4556     }
4557   }
4558 }
4559 
4560 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4561                                               ElementCount VF) {
4562   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4563   PHINode *P = cast<PHINode>(PN);
4564   if (EnableVPlanNativePath) {
4565     // Currently we enter here in the VPlan-native path for non-induction
4566     // PHIs where all control flow is uniform. We simply widen these PHIs.
4567     // Create a vector phi with no operands - the vector phi operands will be
4568     // set at the end of vector code generation.
4569     Type *VecTy =
4570         (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4571     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4572     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4573     OrigPHIsToFix.push_back(P);
4574 
4575     return;
4576   }
4577 
4578   assert(PN->getParent() == OrigLoop->getHeader() &&
4579          "Non-header phis should have been handled elsewhere");
4580 
4581   // In order to support recurrences we need to be able to vectorize Phi nodes.
4582   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4583   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4584   // this value when we vectorize all of the instructions that use the PHI.
4585   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4586     for (unsigned Part = 0; Part < UF; ++Part) {
4587       // This is phase one of vectorizing PHIs.
4588       bool ScalarPHI =
4589           (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4590       Type *VecTy =
4591           ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4592       Value *EntryPart = PHINode::Create(
4593           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4594       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4595     }
4596     return;
4597   }
4598 
4599   setDebugLocFromInst(Builder, P);
4600 
4601   // This PHINode must be an induction variable.
4602   // Make sure that we know about it.
4603   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4604 
4605   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4606   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4607 
4608   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4609   // which can be found from the original scalar operations.
4610   switch (II.getKind()) {
4611   case InductionDescriptor::IK_NoInduction:
4612     llvm_unreachable("Unknown induction");
4613   case InductionDescriptor::IK_IntInduction:
4614   case InductionDescriptor::IK_FpInduction:
4615     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4616   case InductionDescriptor::IK_PtrInduction: {
4617     // Handle the pointer induction variable case.
4618     assert(P->getType()->isPointerTy() && "Unexpected type.");
4619 
4620     if (Cost->isScalarAfterVectorization(P, VF)) {
4621       // This is the normalized GEP that starts counting at zero.
4622       Value *PtrInd =
4623           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4624       // Determine the number of scalars we need to generate for each unroll
4625       // iteration. If the instruction is uniform, we only need to generate the
4626       // first lane. Otherwise, we generate all VF values.
4627       unsigned Lanes =
4628           Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4629       for (unsigned Part = 0; Part < UF; ++Part) {
4630         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4631           Constant *Idx = ConstantInt::get(PtrInd->getType(),
4632                                            Lane + Part * VF.getKnownMinValue());
4633           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4634           Value *SclrGep =
4635               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4636           SclrGep->setName("next.gep");
4637           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4638         }
4639       }
4640       return;
4641     }
4642     assert(isa<SCEVConstant>(II.getStep()) &&
4643            "Induction step not a SCEV constant!");
4644     Type *PhiType = II.getStep()->getType();
4645 
4646     // Build a pointer phi
4647     Value *ScalarStartValue = II.getStartValue();
4648     Type *ScStValueType = ScalarStartValue->getType();
4649     PHINode *NewPointerPhi =
4650         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4651     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4652 
    // A pointer induction, advanced by a GEP in the loop latch.
4654     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4655     Instruction *InductionLoc = LoopLatch->getTerminator();
4656     const SCEV *ScalarStep = II.getStep();
4657     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4658     Value *ScalarStepValue =
4659         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4660     Value *InductionGEP = GetElementPtrInst::Create(
4661         ScStValueType->getPointerElementType(), NewPointerPhi,
4662         Builder.CreateMul(
4663             ScalarStepValue,
4664             ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4665         "ptr.ind", InductionLoc);
4666     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4667 
4668     // Create UF many actual address geps that use the pointer
4669     // phi as base and a vectorized version of the step value
4670     // (<step*0, ..., step*N>) as offset.
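    //
    // For example (sketch), with VF = 4, UF = 2, and step %s, part 0 uses the
    // offset vector <0*%s, 1*%s, 2*%s, 3*%s> and part 1 uses
    // <4*%s, 5*%s, 6*%s, 7*%s>, both applied to the same pointer phi.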
4671     for (unsigned Part = 0; Part < UF; ++Part) {
4672       SmallVector<Constant *, 8> Indices;
      // Create a vector of consecutive offsets for this unroll part:
      // Part * VF .. Part * VF + VF - 1.
4674       for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4675         Indices.push_back(
4676             ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4677       Constant *StartOffset = ConstantVector::get(Indices);
4678 
4679       Value *GEP = Builder.CreateGEP(
4680           ScStValueType->getPointerElementType(), NewPointerPhi,
4681           Builder.CreateMul(
4682               StartOffset,
4683               Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4684               "vector.gep"));
4685       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4686     }
4687   }
4688   }
4689 }
4690 
4691 /// A helper function for checking whether an integer division-related
4692 /// instruction may divide by zero (in which case it must be predicated if
4693 /// executed conditionally in the scalar code).
4694 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4695 /// Non-zero divisors that are non compile-time constants will not be
4696 /// converted into multiplication, so we will still end up scalarizing
4697 /// the division, but can do so w/o predication.
4698 static bool mayDivideByZero(Instruction &I) {
4699   assert((I.getOpcode() == Instruction::UDiv ||
4700           I.getOpcode() == Instruction::SDiv ||
4701           I.getOpcode() == Instruction::URem ||
4702           I.getOpcode() == Instruction::SRem) &&
4703          "Unexpected instruction");
4704   Value *Divisor = I.getOperand(1);
4705   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4706   return !CInt || CInt->isZero();
4707 }
4708 
4709 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4710                                            VPUser &User,
4711                                            VPTransformState &State) {
4712   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4713   switch (I.getOpcode()) {
4714   case Instruction::Call:
4715   case Instruction::Br:
4716   case Instruction::PHI:
4717   case Instruction::GetElementPtr:
4718   case Instruction::Select:
4719     llvm_unreachable("This instruction is handled by a different recipe.");
4720   case Instruction::UDiv:
4721   case Instruction::SDiv:
4722   case Instruction::SRem:
4723   case Instruction::URem:
4724   case Instruction::Add:
4725   case Instruction::FAdd:
4726   case Instruction::Sub:
4727   case Instruction::FSub:
4728   case Instruction::FNeg:
4729   case Instruction::Mul:
4730   case Instruction::FMul:
4731   case Instruction::FDiv:
4732   case Instruction::FRem:
4733   case Instruction::Shl:
4734   case Instruction::LShr:
4735   case Instruction::AShr:
4736   case Instruction::And:
4737   case Instruction::Or:
4738   case Instruction::Xor: {
4739     // Just widen unops and binops.
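    // For example (illustrative only), with VF=4 a scalar 'add i32 %a, %b'
    // becomes one 'add <4 x i32>' per unrolled part, with operands taken from
    // the per-part vector values in State.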
4740     setDebugLocFromInst(Builder, &I);
4741 
4742     for (unsigned Part = 0; Part < UF; ++Part) {
4743       SmallVector<Value *, 2> Ops;
4744       for (VPValue *VPOp : User.operands())
4745         Ops.push_back(State.get(VPOp, Part));
4746 
4747       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4748 
4749       if (auto *VecOp = dyn_cast<Instruction>(V))
4750         VecOp->copyIRFlags(&I);
4751 
4752       // Use this vector value for all users of the original instruction.
4753       State.set(Def, &I, V, Part);
4754       addMetadata(V, &I);
4755     }
4756 
4757     break;
4758   }
4759   case Instruction::ICmp:
4760   case Instruction::FCmp: {
4761     // Widen compares. Generate vector compares.
4762     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4763     auto *Cmp = cast<CmpInst>(&I);
4764     setDebugLocFromInst(Builder, Cmp);
4765     for (unsigned Part = 0; Part < UF; ++Part) {
4766       Value *A = State.get(User.getOperand(0), Part);
4767       Value *B = State.get(User.getOperand(1), Part);
4768       Value *C = nullptr;
4769       if (FCmp) {
4770         // Propagate fast math flags.
4771         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4772         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4773         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4774       } else {
4775         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4776       }
4777       State.set(Def, &I, C, Part);
4778       addMetadata(C, &I);
4779     }
4780 
4781     break;
4782   }
4783 
4784   case Instruction::ZExt:
4785   case Instruction::SExt:
4786   case Instruction::FPToUI:
4787   case Instruction::FPToSI:
4788   case Instruction::FPExt:
4789   case Instruction::PtrToInt:
4790   case Instruction::IntToPtr:
4791   case Instruction::SIToFP:
4792   case Instruction::UIToFP:
4793   case Instruction::Trunc:
4794   case Instruction::FPTrunc:
4795   case Instruction::BitCast: {
4796     auto *CI = cast<CastInst>(&I);
4797     setDebugLocFromInst(Builder, CI);
4798 
4799     /// Vectorize casts.
4800     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4801     Type *DestTy =
4802         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4803 
4804     for (unsigned Part = 0; Part < UF; ++Part) {
4805       Value *A = State.get(User.getOperand(0), Part);
4806       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4807       State.set(Def, &I, Cast, Part);
4808       addMetadata(Cast, &I);
4809     }
4810     break;
4811   }
4812   default:
4813     // This instruction is not vectorized by simple widening.
4814     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4815     llvm_unreachable("Unhandled instruction!");
4816   } // end of switch.
4817 }
4818 
4819 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4820                                                VPUser &ArgOperands,
4821                                                VPTransformState &State) {
4822   assert(!isa<DbgInfoIntrinsic>(I) &&
4823          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4824   setDebugLocFromInst(Builder, &I);
4825 
4826   Module *M = I.getParent()->getParent()->getParent();
4827   auto *CI = cast<CallInst>(&I);
4828 
4829   SmallVector<Type *, 4> Tys;
4830   for (Value *ArgOperand : CI->arg_operands())
4831     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4832 
4833   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4834 
  // Decide whether to use an intrinsic or a regular call for the vectorized
  // version of the instruction: is the intrinsic call cheaper than the
  // (possibly vectorized) library call?
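  // E.g. (illustrative), a call to @llvm.sqrt.f32 may be widened to
  // @llvm.sqrt.v4f32, or to a vectorized library routine found via the
  // VFDatabase, whichever the cost model considers cheaper.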
4838   bool NeedToScalarize = false;
4839   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4840   bool UseVectorIntrinsic =
4841       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4842   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4843          "Instruction should be scalarized elsewhere.");
4844 
4845   for (unsigned Part = 0; Part < UF; ++Part) {
4846     SmallVector<Value *, 4> Args;
4847     for (auto &I : enumerate(ArgOperands.operands())) {
4848       // Some intrinsics have a scalar argument - don't replace it with a
4849       // vector.
4850       Value *Arg;
4851       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4852         Arg = State.get(I.value(), Part);
4853       else
4854         Arg = State.get(I.value(), {0, 0});
4855       Args.push_back(Arg);
4856     }
4857 
4858     Function *VectorF;
4859     if (UseVectorIntrinsic) {
4860       // Use vector version of the intrinsic.
4861       Type *TysForDecl[] = {CI->getType()};
4862       if (VF.isVector()) {
4863         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4864         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4865       }
4866       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4867       assert(VectorF && "Can't retrieve vector intrinsic.");
4868     } else {
4869       // Use vector version of the function call.
4870       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4871 #ifndef NDEBUG
4872       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4873              "Can't create vector function.");
4874 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, &I, V, Part);
    addMetadata(V, &I);
4886   }
4887 }
4888 
4889 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
4890                                                  VPUser &Operands,
4891                                                  bool InvariantCond,
4892                                                  VPTransformState &State) {
4893   setDebugLocFromInst(Builder, &I);
4894 
  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
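  // For example (illustrative only), with VF=4:
  //   invariant condition: select i1 %cond.lane0, <4 x i32> %a, <4 x i32> %b
  //   varying condition:   select <4 x i1> %cond.vec, <4 x i32> %a, <4 x i32> %b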
4899   auto *InvarCond =
4900       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4901 
4902   for (unsigned Part = 0; Part < UF; ++Part) {
4903     Value *Cond =
4904         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4905     Value *Op0 = State.get(Operands.getOperand(1), Part);
4906     Value *Op1 = State.get(Operands.getOperand(2), Part);
4907     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4908     State.set(VPDef, &I, Sel, Part);
4909     addMetadata(Sel, &I);
4910   }
4911 }
4912 
4913 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4914   // We should not collect Scalars more than once per VF. Right now, this
4915   // function is called from collectUniformsAndScalars(), which already does
4916   // this check. Collecting Scalars for VF=1 does not make any sense.
4917   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4918          "This function should not be visited twice for the same VF");
4919 
4920   SmallSetVector<Instruction *, 8> Worklist;
4921 
4922   // These sets are used to seed the analysis with pointers used by memory
4923   // accesses that will remain scalar.
4924   SmallSetVector<Instruction *, 8> ScalarPtrs;
4925   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4926   auto *Latch = TheLoop->getLoopLatch();
4927 
4928   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4929   // The pointer operands of loads and stores will be scalar as long as the
4930   // memory access is not a gather or scatter operation. The value operand of a
4931   // store will remain scalar if the store is scalarized.
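  // For example, the address of a consecutive (widened) or interleaved access
  // only needs the lane-zero pointer and so remains a scalar use, whereas a
  // gather/scatter needs a full vector of pointers and does not.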
4932   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4933     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4934     assert(WideningDecision != CM_Unknown &&
4935            "Widening decision should be ready at this moment");
4936     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4937       if (Ptr == Store->getValueOperand())
4938         return WideningDecision == CM_Scalarize;
4939     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4940            "Ptr is neither a value or pointer operand");
4941     return WideningDecision != CM_GatherScatter;
4942   };
4943 
4944   // A helper that returns true if the given value is a bitcast or
4945   // getelementptr instruction contained in the loop.
4946   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4947     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4948             isa<GetElementPtrInst>(V)) &&
4949            !TheLoop->isLoopInvariant(V);
4950   };
4951 
4952   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4953     if (!isa<PHINode>(Ptr) ||
4954         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4955       return false;
4956     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4957     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4958       return false;
4959     return isScalarUse(MemAccess, Ptr);
4960   };
4961 
  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is actually the pointer induction of a loop, it is inserted
  // into the Worklist. If the use will be a scalar use, and the pointer is
  // only used by memory accesses, we place the pointer in ScalarPtrs.
  // Otherwise, the pointer is placed in PossibleNonScalarPtrs.
4967   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4968     if (isScalarPtrInduction(MemAccess, Ptr)) {
4969       Worklist.insert(cast<Instruction>(Ptr));
4970       Instruction *Update = cast<Instruction>(
4971           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4972       Worklist.insert(Update);
4973       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4974                         << "\n");
4975       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4976                         << "\n");
4977       return;
4978     }
4979     // We only care about bitcast and getelementptr instructions contained in
4980     // the loop.
4981     if (!isLoopVaryingBitCastOrGEP(Ptr))
4982       return;
4983 
4984     // If the pointer has already been identified as scalar (e.g., if it was
4985     // also identified as uniform), there's nothing to do.
4986     auto *I = cast<Instruction>(Ptr);
4987     if (Worklist.count(I))
4988       return;
4989 
4990     // If the use of the pointer will be a scalar use, and all users of the
4991     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4992     // place the pointer in PossibleNonScalarPtrs.
4993     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4994           return isa<LoadInst>(U) || isa<StoreInst>(U);
4995         }))
4996       ScalarPtrs.insert(I);
4997     else
4998       PossibleNonScalarPtrs.insert(I);
4999   };
5000 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
5005   //
5006   // (1) Add to the worklist all instructions that have been identified as
5007   // uniform-after-vectorization.
5008   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5009 
5010   // (2) Add to the worklist all bitcast and getelementptr instructions used by
5011   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
5013   // scatter operation. The value operand of a store will remain scalar if the
5014   // store is scalarized.
5015   for (auto *BB : TheLoop->blocks())
5016     for (auto &I : *BB) {
5017       if (auto *Load = dyn_cast<LoadInst>(&I)) {
5018         evaluatePtrUse(Load, Load->getPointerOperand());
5019       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5020         evaluatePtrUse(Store, Store->getPointerOperand());
5021         evaluatePtrUse(Store, Store->getValueOperand());
5022       }
5023     }
5024   for (auto *I : ScalarPtrs)
5025     if (!PossibleNonScalarPtrs.count(I)) {
5026       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5027       Worklist.insert(I);
5028     }
5029 
5030   // Insert the forced scalars.
5031   // FIXME: Currently widenPHIInstruction() often creates a dead vector
5032   // induction variable when the PHI user is scalarized.
5033   auto ForcedScalar = ForcedScalars.find(VF);
5034   if (ForcedScalar != ForcedScalars.end())
5035     for (auto *I : ForcedScalar->second)
5036       Worklist.insert(I);
5037 
5038   // Expand the worklist by looking through any bitcasts and getelementptr
5039   // instructions we've already identified as scalar. This is similar to the
5040   // expansion step in collectLoopUniforms(); however, here we're only
5041   // expanding to include additional bitcasts and getelementptr instructions.
5042   unsigned Idx = 0;
5043   while (Idx != Worklist.size()) {
5044     Instruction *Dst = Worklist[Idx++];
5045     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5046       continue;
5047     auto *Src = cast<Instruction>(Dst->getOperand(0));
5048     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5049           auto *J = cast<Instruction>(U);
5050           return !TheLoop->contains(J) || Worklist.count(J) ||
5051                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5052                   isScalarUse(J, Src));
5053         })) {
5054       Worklist.insert(Src);
5055       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5056     }
5057   }
5058 
5059   // An induction variable will remain scalar if all users of the induction
5060   // variable and induction variable update remain scalar.
5061   for (auto &Induction : Legal->getInductionVars()) {
5062     auto *Ind = Induction.first;
5063     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5064 
5065     // If tail-folding is applied, the primary induction variable will be used
5066     // to feed a vector compare.
5067     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5068       continue;
5069 
5070     // Determine if all users of the induction variable are scalar after
5071     // vectorization.
5072     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5073       auto *I = cast<Instruction>(U);
5074       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5075     });
5076     if (!ScalarInd)
5077       continue;
5078 
5079     // Determine if all users of the induction variable update instruction are
5080     // scalar after vectorization.
5081     auto ScalarIndUpdate =
5082         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5083           auto *I = cast<Instruction>(U);
5084           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5085         });
5086     if (!ScalarIndUpdate)
5087       continue;
5088 
5089     // The induction variable and its update instruction will remain scalar.
5090     Worklist.insert(Ind);
5091     Worklist.insert(IndUpdate);
5092     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5093     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5094                       << "\n");
5095   }
5096 
5097   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5098 }
5099 
5100 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
5101                                                          ElementCount VF) {
5102   assert(!VF.isScalable() && "scalable vectors not yet supported.");
5103   if (!blockNeedsPredication(I->getParent()))
5104     return false;
5105   switch(I->getOpcode()) {
5106   default:
5107     break;
5108   case Instruction::Load:
5109   case Instruction::Store: {
5110     if (!Legal->isMaskRequired(I))
5111       return false;
5112     auto *Ptr = getLoadStorePointerOperand(I);
5113     auto *Ty = getMemInstValueType(I);
5114     // We have already decided how to vectorize this instruction, get that
5115     // result.
5116     if (VF.isVector()) {
5117       InstWidening WideningDecision = getWideningDecision(I, VF);
5118       assert(WideningDecision != CM_Unknown &&
5119              "Widening decision should be ready at this moment");
5120       return WideningDecision == CM_Scalarize;
5121     }
5122     const Align Alignment = getLoadStoreAlignment(I);
5123     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5124                                 isLegalMaskedGather(Ty, Alignment))
5125                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5126                                 isLegalMaskedScatter(Ty, Alignment));
5127   }
5128   case Instruction::UDiv:
5129   case Instruction::SDiv:
5130   case Instruction::SRem:
5131   case Instruction::URem:
5132     return mayDivideByZero(*I);
5133   }
5134   return false;
5135 }
5136 
5137 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5138     Instruction *I, ElementCount VF) {
5139   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5140   assert(getWideningDecision(I, VF) == CM_Unknown &&
5141          "Decision should not be set yet.");
5142   auto *Group = getInterleavedAccessGroup(I);
5143   assert(Group && "Must have a group.");
5144 
  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
5147   auto &DL = I->getModule()->getDataLayout();
5148   auto *ScalarTy = getMemInstValueType(I);
5149   if (hasIrregularType(ScalarTy, DL, VF))
5150     return false;
5151 
5152   // Check if masking is required.
5153   // A Group may need masking for one of two reasons: it resides in a block that
5154   // needs predication, or it was decided to use masking to deal with gaps.
5155   bool PredicatedAccessRequiresMasking =
5156       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5157   bool AccessWithGapsRequiresMasking =
5158       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5159   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5160     return true;
5161 
5162   // If masked interleaving is required, we expect that the user/target had
5163   // enabled it, because otherwise it either wouldn't have been created or
5164   // it should have been invalidated by the CostModel.
5165   assert(useMaskedInterleavedAccesses(TTI) &&
5166          "Masked interleave-groups for predicated accesses are not enabled.");
5167 
5168   auto *Ty = getMemInstValueType(I);
5169   const Align Alignment = getLoadStoreAlignment(I);
5170   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5171                           : TTI.isLegalMaskedStore(Ty, Alignment);
5172 }
5173 
5174 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5175     Instruction *I, ElementCount VF) {
5176   // Get and ensure we have a valid memory instruction.
5177   LoadInst *LI = dyn_cast<LoadInst>(I);
5178   StoreInst *SI = dyn_cast<StoreInst>(I);
5179   assert((LI || SI) && "Invalid memory instruction");
5180 
5181   auto *Ptr = getLoadStorePointerOperand(I);
5182 
5183   // In order to be widened, the pointer should be consecutive, first of all.
5184   if (!Legal->isConsecutivePtr(Ptr))
5185     return false;
5186 
5187   // If the instruction is a store located in a predicated block, it will be
5188   // scalarized.
5189   if (isScalarWithPredication(I))
5190     return false;
5191 
  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
5194   auto &DL = I->getModule()->getDataLayout();
5195   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5196   if (hasIrregularType(ScalarTy, DL, VF))
5197     return false;
5198 
5199   return true;
5200 }
5201 
5202 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5203   // We should not collect Uniforms more than once per VF. Right now,
5204   // this function is called from collectUniformsAndScalars(), which
5205   // already does this check. Collecting Uniforms for VF=1 does not make any
5206   // sense.
5207 
5208   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5209          "This function should not be visited twice for the same VF");
5210 
  // Visit the list of Uniforms. If we do not find any uniform value, we will
  // not analyze it again; Uniforms.count(VF) will return 1.
5213   Uniforms[VF].clear();
5214 
5215   // We now know that the loop is vectorizable!
5216   // Collect instructions inside the loop that will remain uniform after
5217   // vectorization.
5218 
  // Global values, params and instructions outside of the current loop are
  // out of scope.
5221   auto isOutOfScope = [&](Value *V) -> bool {
5222     Instruction *I = dyn_cast<Instruction>(V);
5223     return (!I || !TheLoop->contains(I));
5224   };
5225 
5226   SetVector<Instruction *> Worklist;
5227   BasicBlock *Latch = TheLoop->getLoopLatch();
5228 
5229   // Instructions that are scalar with predication must not be considered
5230   // uniform after vectorization, because that would create an erroneous
5231   // replicating region where only a single instance out of VF should be formed.
  // TODO: optimize such rare cases if they turn out to be important, see PR40816.
5233   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5234     if (isOutOfScope(I)) {
5235       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5236                         << *I << "\n");
5237       return;
5238     }
5239     if (isScalarWithPredication(I, VF)) {
5240       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5241                         << *I << "\n");
5242       return;
5243     }
5244     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5245     Worklist.insert(I);
5246   };
5247 
5248   // Start with the conditional branch. If the branch condition is an
5249   // instruction contained in the loop that is only used by the branch, it is
5250   // uniform.
5251   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5252   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5253     addToWorklistIfAllowed(Cmp);
5254 
5255   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
5256   // are pointers that are treated like consecutive pointers during
5257   // vectorization. The pointer operands of interleaved accesses are an
5258   // example.
5259   SmallSetVector<Value *, 8> ConsecutiveLikePtrs;
5260 
5261   // Holds pointer operands of instructions that are possibly non-uniform.
5262   SmallPtrSet<Value *, 8> PossibleNonUniformPtrs;
5263 
5264   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5265     InstWidening WideningDecision = getWideningDecision(I, VF);
5266     assert(WideningDecision != CM_Unknown &&
5267            "Widening decision should be ready at this moment");
5268 
5269     // The address of a uniform mem op is itself uniform.  We exclude stores
5270     // here as there's an assumption in the current code that all uses of
5271     // uniform instructions are uniform and, as noted below, uniform stores are
5272     // still handled via replication (i.e. aren't uniform after vectorization).
5273     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5274       assert(WideningDecision == CM_Scalarize);
5275       return true;
5276     }
5277 
5278     return (WideningDecision == CM_Widen ||
5279             WideningDecision == CM_Widen_Reverse ||
5280             WideningDecision == CM_Interleave);
5281   };
5282 
5283 
5284   // Returns true if Ptr is the pointer operand of a memory access instruction
5285   // I, and I is known to not require scalarization.
5286   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5287     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5288   };
5289 
5290   // Iterate over the instructions in the loop, and collect all
5291   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
5292   // that a consecutive-like pointer operand will be scalarized, we collect it
5293   // in PossibleNonUniformPtrs instead. We use two sets here because a single
5294   // getelementptr instruction can be used by both vectorized and scalarized
5295   // memory instructions. For example, if a loop loads and stores from the same
5296   // location, but the store is conditional, the store will be scalarized, and
5297   // the getelementptr won't remain uniform.
5298   for (auto *BB : TheLoop->blocks())
5299     for (auto &I : *BB) {
5300       // If there's no pointer operand, there's nothing to do.
5301       auto *Ptr = getLoadStorePointerOperand(&I);
5302       if (!Ptr)
5303         continue;
5304 
5305       // For now, avoid walking use lists in other functions.
5306       // TODO: Rewrite this algorithm from uses up.
5307       if (!isa<Instruction>(Ptr) && !isa<Argument>(Ptr))
5308         continue;
5309 
5310       // A uniform memory op is itself uniform.  We exclude stores here as we
5311       // haven't yet added dedicated logic in the CLONE path and rely on
5312       // REPLICATE + DSE for correctness.
5313       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5314         addToWorklistIfAllowed(&I);
5315 
5316       // True if all users of Ptr are memory accesses that have Ptr as their
5317       // pointer operand.  Since loops are assumed to be in LCSSA form, this
5318       // disallows uses outside the loop as well.
5319       auto UsersAreMemAccesses =
5320           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
5321             return getLoadStorePointerOperand(U) == Ptr;
5322           });
5323 
5324       // Ensure the memory instruction will not be scalarized or used by
5325       // gather/scatter, making its pointer operand non-uniform. If the pointer
5326       // operand is used by any instruction other than a memory access, we
5327       // conservatively assume the pointer operand may be non-uniform.
5328       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
5329         PossibleNonUniformPtrs.insert(Ptr);
5330 
5331       // If the memory instruction will be vectorized and its pointer operand
5332       // is consecutive-like, or interleaving - the pointer operand should
5333       // remain uniform.
5334       else
5335         ConsecutiveLikePtrs.insert(Ptr);
5336     }
5337 
5338   // Add to the Worklist all consecutive and consecutive-like pointers that
5339   // aren't also identified as possibly non-uniform.
5340   for (auto *V : ConsecutiveLikePtrs)
5341     if (!PossibleNonUniformPtrs.count(V))
5342       if (auto *I = dyn_cast<Instruction>(V))
5343         addToWorklistIfAllowed(I);
5344 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
5348   unsigned idx = 0;
5349   while (idx != Worklist.size()) {
5350     Instruction *I = Worklist[idx++];
5351 
5352     for (auto OV : I->operand_values()) {
5353       // isOutOfScope operands cannot be uniform instructions.
5354       if (isOutOfScope(OV))
5355         continue;
5356       // First order recurrence Phi's should typically be considered
5357       // non-uniform.
5358       auto *OP = dyn_cast<PHINode>(OV);
5359       if (OP && Legal->isFirstOrderRecurrence(OP))
5360         continue;
5361       // If all the users of the operand are uniform, then add the
5362       // operand into the uniform worklist.
5363       auto *OI = cast<Instruction>(OV);
5364       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5365             auto *J = cast<Instruction>(U);
5366             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5367           }))
5368         addToWorklistIfAllowed(OI);
5369     }
5370   }
5371 
5372   // For an instruction to be added into Worklist above, all its users inside
5373   // the loop should also be in Worklist. However, this condition cannot be
5374   // true for phi nodes that form a cyclic dependence. We must process phi
5375   // nodes separately. An induction variable will remain uniform if all users
5376   // of the induction variable and induction variable update remain uniform.
5377   // The code below handles both pointer and non-pointer induction variables.
5378   for (auto &Induction : Legal->getInductionVars()) {
5379     auto *Ind = Induction.first;
5380     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5381 
5382     // Determine if all users of the induction variable are uniform after
5383     // vectorization.
5384     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5385       auto *I = cast<Instruction>(U);
5386       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5387              isVectorizedMemAccessUse(I, Ind);
5388     });
5389     if (!UniformInd)
5390       continue;
5391 
5392     // Determine if all users of the induction variable update instruction are
5393     // uniform after vectorization.
5394     auto UniformIndUpdate =
5395         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5396           auto *I = cast<Instruction>(U);
5397           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5398                  isVectorizedMemAccessUse(I, IndUpdate);
5399         });
5400     if (!UniformIndUpdate)
5401       continue;
5402 
5403     // The induction variable and its update instruction will remain uniform.
5404     addToWorklistIfAllowed(Ind);
5405     addToWorklistIfAllowed(IndUpdate);
5406   }
5407 
5408   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5409 }
5410 
5411 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5412   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5413 
5414   if (Legal->getRuntimePointerChecking()->Need) {
5415     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5416         "runtime pointer checks needed. Enable vectorization of this "
5417         "loop with '#pragma clang loop vectorize(enable)' when "
5418         "compiling with -Os/-Oz",
5419         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5420     return true;
5421   }
5422 
5423   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5424     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5425         "runtime SCEV checks needed. Enable vectorization of this "
5426         "loop with '#pragma clang loop vectorize(enable)' when "
5427         "compiling with -Os/-Oz",
5428         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5429     return true;
5430   }
5431 
5432   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5433   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5434     reportVectorizationFailure("Runtime stride check for small trip count",
5435         "runtime stride == 1 checks needed. Enable vectorization of "
5436         "this loop without such check by compiling with -Os/-Oz",
5437         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5438     return true;
5439   }
5440 
5441   return false;
5442 }
5443 
5444 Optional<ElementCount>
5445 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5446   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the check is still
    // likely to be dynamically uniform if the target can skip it.
5449     reportVectorizationFailure(
5450         "Not inserting runtime ptr check for divergent target",
5451         "runtime pointer checks needed. Not enabled for divergent target",
5452         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5453     return None;
5454   }
5455 
5456   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5457   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5458   if (TC == 1) {
5459     reportVectorizationFailure("Single iteration (non) loop",
5460         "loop trip count is one, irrelevant for vectorization",
5461         "SingleIterationLoop", ORE, TheLoop);
5462     return None;
5463   }
5464 
5465   ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5466 
5467   switch (ScalarEpilogueStatus) {
5468   case CM_ScalarEpilogueAllowed:
5469     return MaxVF;
5470   case CM_ScalarEpilogueNotNeededUsePredicate:
5471     LLVM_DEBUG(
5472         dbgs() << "LV: vector predicate hint/switch found.\n"
5473                << "LV: Not allowing scalar epilogue, creating predicated "
5474                << "vector loop.\n");
5475     break;
5476   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5477     // fallthrough as a special case of OptForSize
5478   case CM_ScalarEpilogueNotAllowedOptSize:
5479     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5480       LLVM_DEBUG(
5481           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5482     else
5483       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5484                         << "count.\n");
5485 
5486     // Bail if runtime checks are required, which are not good when optimising
5487     // for size.
5488     if (runtimeChecksRequired())
5489       return None;
5490     break;
5491   }
5492 
5493   // Now try the tail folding
5494 
5495   // Invalidate interleave groups that require an epilogue if we can't mask
5496   // the interleave-group.
5497   if (!useMaskedInterleavedAccesses(TTI)) {
5498     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5499            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
5502     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5503   }
5504 
5505   assert(!MaxVF.isScalable() &&
5506          "Scalable vectors do not yet support tail folding");
5507   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5508          "MaxVF must be a power of 2");
5509   unsigned MaxVFtimesIC =
5510       UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
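  // For example (illustrative numbers), TC = 100 with MaxVF = 8 and no user
  // interleave count gives MaxVFtimesIC = 8; since 100 % 8 != 0 a tail
  // remains and we try to fold it by masking below.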
5511   if (TC > 0 && TC % MaxVFtimesIC == 0) {
5512     // Accept MaxVF if we do not have a tail.
5513     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5514     return MaxVF;
5515   }
5516 
5517   // If we don't know the precise trip count, or if the trip count that we
5518   // found modulo the vectorization factor is not zero, try to fold the tail
5519   // by masking.
5520   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5521   if (Legal->prepareToFoldTailByMasking()) {
5522     FoldTailByMasking = true;
5523     return MaxVF;
5524   }
5525 
5526   // If there was a tail-folding hint/switch, but we can't fold the tail by
5527   // masking, fallback to a vectorization with a scalar epilogue.
5528   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5529     if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) {
5530       LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5531       return None;
5532     }
5533     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5534                          "scalar epilogue instead.\n");
5535     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5536     return MaxVF;
5537   }
5538 
5539   if (TC == 0) {
5540     reportVectorizationFailure(
5541         "Unable to calculate the loop count due to complex control flow",
5542         "unable to calculate the loop count due to complex control flow",
5543         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5544     return None;
5545   }
5546 
5547   reportVectorizationFailure(
5548       "Cannot optimize for size and vectorize at the same time.",
5549       "cannot optimize for size and vectorize at the same time. "
5550       "Enable vectorization of this loop with '#pragma clang loop "
5551       "vectorize(enable)' when compiling with -Os/-Oz",
5552       "NoTailLoopWithOptForSize", ORE, TheLoop);
5553   return None;
5554 }
5555 
5556 ElementCount
5557 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5558                                                  ElementCount UserVF) {
5559   assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
5560   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5561   unsigned SmallestType, WidestType;
5562   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5563   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5564 
  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
5569   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
5570 
5571   if (UserVF.isNonZero()) {
5572     // If legally unsafe, clamp the user vectorization factor to a safe value.
5573     unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
5574     if (UserVF.getFixedValue() <= MaxSafeVF)
5575       return UserVF;
5576 
5577     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5578                       << " is unsafe, clamping to max safe VF=" << MaxSafeVF
5579                       << ".\n");
5580     ORE->emit([&]() {
5581       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5582                                         TheLoop->getStartLoc(),
5583                                         TheLoop->getHeader())
5584              << "User-specified vectorization factor "
5585              << ore::NV("UserVectorizationFactor", UserVF)
5586              << " is unsafe, clamping to maximum safe vectorization factor "
5587              << ore::NV("VectorizationFactor", MaxSafeVF);
5588     });
5589     return ElementCount::getFixed(MaxSafeVF);
5590   }
5591 
5592   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
5593 
  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
5596   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
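  // For example (illustrative numbers), a 256-bit widest register and a
  // 32-bit widest type give MaxVectorSize = PowerOf2Floor(256 / 32) = 8.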
5597 
5598   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5599                     << " / " << WidestType << " bits.\n");
5600   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5601                     << WidestRegister << " bits.\n");
5602 
5603   assert(MaxVectorSize <= WidestRegister &&
5604          "Did not expect to pack so many elements"
5605          " into one vector!");
5606   if (MaxVectorSize == 0) {
5607     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5608     MaxVectorSize = 1;
5609     return ElementCount::getFixed(MaxVectorSize);
5610   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5611              isPowerOf2_32(ConstTripCount)) {
5612     // We need to clamp the VF to be the ConstTripCount. There is no point in
5613     // choosing a higher viable VF as done in the loop below.
5614     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5615                       << ConstTripCount << "\n");
5616     MaxVectorSize = ConstTripCount;
5617     return ElementCount::getFixed(MaxVectorSize);
5618   }
5619 
5620   unsigned MaxVF = MaxVectorSize;
5621   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5622       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5623     // Collect all viable vectorization factors larger than the default MaxVF
5624     // (i.e. MaxVectorSize).
5625     SmallVector<ElementCount, 8> VFs;
5626     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5627     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5628       VFs.push_back(ElementCount::getFixed(VS));
5629 
5630     // For each VF calculate its register usage.
5631     auto RUs = calculateRegisterUsage(VFs);
5632 
5633     // Select the largest VF which doesn't require more registers than existing
5634     // ones.
5635     for (int i = RUs.size() - 1; i >= 0; --i) {
5636       bool Selected = true;
5637       for (auto& pair : RUs[i].MaxLocalUsers) {
5638         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5639         if (pair.second > TargetNumRegisters)
5640           Selected = false;
5641       }
5642       if (Selected) {
5643         MaxVF = VFs[i].getKnownMinValue();
5644         break;
5645       }
5646     }
5647     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5648       if (MaxVF < MinVF) {
5649         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5650                           << ") with target's minimum: " << MinVF << '\n');
5651         MaxVF = MinVF;
5652       }
5653     }
5654   }
5655   return ElementCount::getFixed(MaxVF);
5656 }
5657 
5658 VectorizationFactor
5659 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
5660   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
5661 
5662   float Cost = expectedCost(ElementCount::getFixed(1)).first;
5663   const float ScalarCost = Cost;
5664   unsigned Width = 1;
5665   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5666 
5667   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5668   if (ForceVectorization && MaxVF.isVector()) {
5669     // Ignore scalar width, because the user explicitly wants vectorization.
5670     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5671     // evaluation.
5672     Cost = std::numeric_limits<float>::max();
5673   }
5674 
5675   for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
5679     VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5680     float VectorCost = C.first / (float)i;
5681     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5682                       << " costs: " << (int)VectorCost << ".\n");
5683     if (!C.second && !ForceVectorization) {
5684       LLVM_DEBUG(
5685           dbgs() << "LV: Not considering vector loop of width " << i
5686                  << " because it will not generate any vector instructions.\n");
5687       continue;
5688     }
5689 
5690     // If profitable add it to ProfitableVF list.
5691     if (VectorCost < ScalarCost) {
5692       ProfitableVFs.push_back(VectorizationFactor(
5693           {ElementCount::getFixed(i), (unsigned)VectorCost}));
5694     }
5695 
5696     if (VectorCost < Cost) {
5697       Cost = VectorCost;
5698       Width = i;
5699     }
5700   }
5701 
5702   if (!EnableCondStoresVectorization && NumPredStores) {
5703     reportVectorizationFailure("There are conditional stores.",
5704         "store that is conditionally executed prevents vectorization",
5705         "ConditionalStore", ORE, TheLoop);
5706     Width = 1;
5707     Cost = ScalarCost;
5708   }
5709 
5710   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5711              << "LV: Vectorization seems to be not beneficial, "
5712              << "but was forced by a user.\n");
5713   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5714   VectorizationFactor Factor = {ElementCount::getFixed(Width),
5715                                 (unsigned)(Width * Cost)};
5716   return Factor;
5717 }
5718 
5719 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5720     const Loop &L, ElementCount VF) const {
5721   // Cross iteration phis such as reductions need special handling and are
5722   // currently unsupported.
5723   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5724         return Legal->isFirstOrderRecurrence(&Phi) ||
5725                Legal->isReductionVariable(&Phi);
5726       }))
5727     return false;
5728 
5729   // Phis with uses outside of the loop require special handling and are
5730   // currently unsupported.
5731   for (auto &Entry : Legal->getInductionVars()) {
5732     // Look for uses of the value of the induction at the last iteration.
5733     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5734     for (User *U : PostInc->users())
5735       if (!L.contains(cast<Instruction>(U)))
5736         return false;
5737     // Look for uses of penultimate value of the induction.
5738     for (User *U : Entry.first->users())
5739       if (!L.contains(cast<Instruction>(U)))
5740         return false;
5741   }
5742 
5743   // Induction variables that are widened require special handling that is
5744   // currently not supported.
5745   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5746         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5747                  this->isProfitableToScalarize(Entry.first, VF));
5748       }))
5749     return false;
5750 
5751   return true;
5752 }
5753 
5754 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5755     const ElementCount VF) const {
5756   // FIXME: We need a much better cost-model to take different parameters such
5757   // as register pressure, code size increase and cost of extra branches into
5758   // account. For now we apply a very crude heuristic and only consider loops
5759   // with vectorization factors larger than a certain value.
  // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g., MVE).
5762   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5763     return false;
5764   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5765     return true;
5766   return false;
5767 }
5768 
5769 VectorizationFactor
5770 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5771     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5772   VectorizationFactor Result = VectorizationFactor::Disabled();
5773   if (!EnableEpilogueVectorization) {
5774     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5775     return Result;
5776   }
5777 
5778   if (!isScalarEpilogueAllowed()) {
5779     LLVM_DEBUG(
5780         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5781                   "allowed.\n";);
5782     return Result;
5783   }
5784 
5785   // Not really a cost consideration, but check for unsupported cases here to
5786   // simplify the logic.
5787   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5788     LLVM_DEBUG(
5789         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5790                   "not a supported candidate.\n";);
5791     return Result;
5792   }
5793 
5794   if (EpilogueVectorizationForceVF > 1) {
5795     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5796     if (LVP.hasPlanWithVFs(
5797             {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
5798       return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
5799     else {
5800       LLVM_DEBUG(
5801           dbgs()
5802               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5803       return Result;
5804     }
5805   }
5806 
5807   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5808       TheLoop->getHeader()->getParent()->hasMinSize()) {
5809     LLVM_DEBUG(
5810         dbgs()
5811             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5812     return Result;
5813   }
5814 
5815   if (!isEpilogueVectorizationProfitable(MainLoopVF))
5816     return Result;
5817 
5818   for (auto &NextVF : ProfitableVFs)
5819     if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
5820         (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
5821         LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
5822       Result = NextVF;
5823 
5824   if (Result != VectorizationFactor::Disabled())
5825     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5826                       << Result.Width.getFixedValue() << "\n";);
5827   return Result;
5828 }
5829 
5830 std::pair<unsigned, unsigned>
5831 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5832   unsigned MinWidth = -1U;
5833   unsigned MaxWidth = 8;
5834   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5835 
5836   // For each block.
5837   for (BasicBlock *BB : TheLoop->blocks()) {
5838     // For each instruction in the loop.
5839     for (Instruction &I : BB->instructionsWithoutDebug()) {
5840       Type *T = I.getType();
5841 
5842       // Skip ignored values.
5843       if (ValuesToIgnore.count(&I))
5844         continue;
5845 
5846       // Only examine Loads, Stores and PHINodes.
5847       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5848         continue;
5849 
5850       // Examine PHI nodes that are reduction variables. Update the type to
5851       // account for the recurrence type.
5852       if (auto *PN = dyn_cast<PHINode>(&I)) {
5853         if (!Legal->isReductionVariable(PN))
5854           continue;
5855         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5856         T = RdxDesc.getRecurrenceType();
5857       }
5858 
5859       // Examine the stored values.
5860       if (auto *ST = dyn_cast<StoreInst>(&I))
5861         T = ST->getValueOperand()->getType();
5862 
5863       // Ignore loaded pointer types and stored pointer types that are not
5864       // vectorizable.
5865       //
5866       // FIXME: The check here attempts to predict whether a load or store will
5867       //        be vectorized. We only know this for certain after a VF has
5868       //        been selected. Here, we assume that if an access can be
5869       //        vectorized, it will be. We should also look at extending this
5870       //        optimization to non-pointer types.
5871       //
5872       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5873           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5874         continue;
5875 
5876       MinWidth = std::min(MinWidth,
5877                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5878       MaxWidth = std::max(MaxWidth,
5879                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5880     }
5881   }
5882 
5883   return {MinWidth, MaxWidth};
5884 }
5885 
5886 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5887                                                            unsigned LoopCost) {
5888   // -- The interleave heuristics --
5889   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5890   // There are many micro-architectural considerations that we can't predict
5891   // at this level. For example, frontend pressure (on decode or fetch) due to
5892   // code size, or the number and capabilities of the execution ports.
5893   //
5894   // We use the following heuristics to select the interleave count:
5895   // 1. If the code has reductions, then we interleave to break the cross
5896   // iteration dependency.
5897   // 2. If the loop is really small, then we interleave to reduce the loop
5898   // overhead.
5899   // 3. We don't interleave if we think that we will spill registers to memory
5900   // due to the increased register pressure.
5901 
5902   if (!isScalarEpilogueAllowed())
5903     return 1;
5904 
5905   // We used the distance for the interleave count.
5906   if (Legal->getMaxSafeDepDistBytes() != -1U)
5907     return 1;
5908 
5909   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5910   const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because with the above conditions interleaving can expose ILP and break
  // cross iteration dependences for reductions.
5916   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5917       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5918     return 1;
5919 
5920   RegisterUsage R = calculateRegisterUsage({VF})[0];
5921   // We divide by these constants so assume that we have at least one
5922   // instruction that uses at least one register.
5923   for (auto& pair : R.MaxLocalUsers) {
5924     pair.second = std::max(pair.second, 1U);
5925   }
5926 
5927   // We calculate the interleave count using the following formula.
5928   // Subtract the number of loop invariants from the number of available
5929   // registers. These registers are used by all of the interleaved instances.
5930   // Next, divide the remaining registers by the number of registers that is
5931   // required by the loop, in order to estimate how many parallel instances
5932   // fit without causing spills. All of this is rounded down if necessary to be
5933   // a power of two. We want power of two interleave count to simplify any
5934   // addressing operations or alignment considerations.
5935   // We also want power of two interleave counts to ensure that the induction
5936   // variable of the vector loop wraps to zero, when tail is folded by masking;
5937   // this currently happens when OptForSize, in which case IC is set to 1 above.
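  // For example (illustrative numbers), with 32 available registers, 2
  // loop-invariant values and a maximum local usage of 5 registers, the
  // initial estimate is PowerOf2Floor((32 - 2) / 5) = PowerOf2Floor(6) = 4.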
5938   unsigned IC = UINT_MAX;
5939 
5940   for (auto& pair : R.MaxLocalUsers) {
5941     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5942     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5943                       << " registers of "
5944                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5945     if (VF.isScalar()) {
5946       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5947         TargetNumRegisters = ForceTargetNumScalarRegs;
5948     } else {
5949       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5950         TargetNumRegisters = ForceTargetNumVectorRegs;
5951     }
5952     unsigned MaxLocalUsers = pair.second;
5953     unsigned LoopInvariantRegs = 0;
5954     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5955       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5956 
5957     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5958     // Don't count the induction variable as interleaved.
5959     if (EnableIndVarRegisterHeur) {
5960       TmpIC =
5961           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5962                         std::max(1U, (MaxLocalUsers - 1)));
5963     }
5964 
5965     IC = std::min(IC, TmpIC);
5966   }
5967 
5968   // Clamp the interleave ranges to reasonable counts.
5969   assert(!VF.isScalable() && "scalable vectors not yet supported.");
5970   unsigned MaxInterleaveCount =
5971       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5972 
5973   // Check if the user has overridden the max.
5974   if (VF.isScalar()) {
5975     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5976       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5977   } else {
5978     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5979       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5980   }
5981 
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count so that it does not exceed the trip count divided by
  // VF, while keeping it at least 1.
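  // For example, a best known trip count of 32 at VF = 8 clamps
  // MaxInterleaveCount to at most 32 / 8 = 4.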
5985   if (BestKnownTC) {
5986     MaxInterleaveCount =
5987         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5988     // Make sure MaxInterleaveCount is greater than 0.
5989     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5990   }
5991 
5992   assert(MaxInterleaveCount > 0 &&
5993          "Maximum interleave count must be greater than 0");
5994 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5997   if (IC > MaxInterleaveCount)
5998     IC = MaxInterleaveCount;
5999   else
6000     // Make sure IC is greater than 0.
6001     IC = std::max(1u, IC);
6002 
6003   assert(IC > 0 && "Interleave count must be greater than 0.");
6004 
6005   // If we did not calculate the cost for VF (because the user selected the VF)
6006   // then we calculate the cost of VF here.
6007   if (LoopCost == 0)
6008     LoopCost = expectedCost(VF).first;
6009 
6010   assert(LoopCost && "Non-zero loop cost expected");
6011 
6012   // Interleave if we vectorized this loop and there is a reduction that could
6013   // benefit from interleaving.
6014   if (VF.isVector() && HasReductions) {
6015     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6016     return IC;
6017   }
6018 
6019   // Note that if we've already vectorized the loop we will have done the
6020   // runtime check and so interleaving won't require further checks.
6021   bool InterleavingRequiresRuntimePointerCheck =
6022       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6023 
6024   // We want to interleave small loops in order to reduce the loop overhead and
6025   // potentially expose ILP opportunities.
6026   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6027                     << "LV: IC is " << IC << '\n'
6028                     << "LV: VF is " << VF.getKnownMinValue() << '\n');
6029   const bool AggressivelyInterleaveReductions =
6030       TTI.enableAggressiveInterleaving(HasReductions);
6031   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the loop overhead costs 1 and use the cost model to
    // estimate the cost of the loop body; we then interleave until the loop
    // overhead is about 5% of the cost of the interleaved loop.
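    // For instance (purely illustrative numbers): with SmallLoopCost = 20 and
    // a loop body cost of 5, SmallIC = min(IC, PowerOf2Floor(20 / 5)) =
    // min(IC, 4).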
6035     unsigned SmallIC =
6036         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6037 
6038     // Interleave until store/load ports (estimated by max interleave count) are
6039     // saturated.
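    // E.g. (illustrative): IC = 8 with 2 stores and 1 load yields
    // StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 1 = 8.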
6040     unsigned NumStores = Legal->getNumStores();
6041     unsigned NumLoads = Legal->getNumLoads();
6042     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6043     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6044 
6045     // If we have a scalar reduction (vector reductions are already dealt with
6046     // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit the interleave count,
    // by default to 2, so that the critical path only gets increased by one
    // reduction operation.
6049     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6050       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6051       SmallIC = std::min(SmallIC, F);
6052       StoresIC = std::min(StoresIC, F);
6053       LoadsIC = std::min(LoadsIC, F);
6054     }
6055 
6056     if (EnableLoadStoreRuntimeInterleave &&
6057         std::max(StoresIC, LoadsIC) > SmallIC) {
6058       LLVM_DEBUG(
6059           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6060       return std::max(StoresIC, LoadsIC);
6061     }
6062 
6063     // If there are scalar reductions and TTI has enabled aggressive
6064     // interleaving for reductions, we will interleave to expose ILP.
6065     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6066         AggressivelyInterleaveReductions) {
6067       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressively as the normal
      // IC, to handle the rare situation where resources are too limited.
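      // E.g. (illustrative): with IC = 8 and SmallIC = 2 this returns
      // max(8 / 2, 2) = 4.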
6070       return std::max(IC / 2, SmallIC);
6071     } else {
6072       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6073       return SmallIC;
6074     }
6075   }
6076 
6077   // Interleave if this is a large loop (small loops are already dealt with by
6078   // this point) that could benefit from interleaving.
6079   if (AggressivelyInterleaveReductions) {
6080     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6081     return IC;
6082   }
6083 
6084   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6085   return 1;
6086 }
6087 
6088 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6089 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6090   // This function calculates the register usage by measuring the highest number
6091   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
6093   // assign a number to each instruction. We use RPO to ensure that defs are
6094   // met before their users. We assume that each instruction that has in-loop
6095   // users starts an interval. We record every time that an in-loop value is
6096   // used, so we have a list of the first and last occurrences of each
6097   // instruction. Next, we transpose this data structure into a multi map that
6098   // holds the list of intervals that *end* at a specific location. This multi
6099   // map allows us to perform a linear search. We scan the instructions linearly
6100   // and record each time that a new interval starts, by placing it in a set.
6101   // If we find this value in the multi-map then we remove it from the set.
6102   // The max register usage is the maximum size of the set.
6103   // We also search for instructions that are defined outside the loop, but are
6104   // used inside the loop. We need this number separately from the max-interval
6105   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
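  // A small illustrative example (hypothetical IR, scalar VF, one register
  // class):
  //   %a = ...          ; index 1
  //   %b = add %a, 1    ; index 2
  //   %c = mul %a, %b   ; index 3
  // %a is live over [1, 3] and %b over [2, 3], so two intervals are open when
  // %c is visited and the maximum local usage recorded for the class is 2.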
6107   LoopBlocksDFS DFS(TheLoop);
6108   DFS.perform(LI);
6109 
6110   RegisterUsage RU;
6111 
6112   // Each 'key' in the map opens a new interval. The values
6113   // of the map are the index of the 'last seen' usage of the
6114   // instruction that is the key.
6115   using IntervalMap = DenseMap<Instruction *, unsigned>;
6116 
6117   // Maps instruction to its index.
6118   SmallVector<Instruction *, 64> IdxToInstr;
6119   // Marks the end of each interval.
6120   IntervalMap EndPoint;
  // Saves the set of instructions whose values are used within the loop.
6122   SmallPtrSet<Instruction *, 8> Ends;
6123   // Saves the list of values that are used in the loop but are
6124   // defined outside the loop, such as arguments and constants.
6125   SmallPtrSet<Value *, 8> LoopInvariants;
6126 
6127   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6128     for (Instruction &I : BB->instructionsWithoutDebug()) {
6129       IdxToInstr.push_back(&I);
6130 
6131       // Save the end location of each USE.
6132       for (Value *U : I.operands()) {
6133         auto *Instr = dyn_cast<Instruction>(U);
6134 
6135         // Ignore non-instruction values such as arguments, constants, etc.
6136         if (!Instr)
6137           continue;
6138 
6139         // If this instruction is outside the loop then record it and continue.
6140         if (!TheLoop->contains(Instr)) {
6141           LoopInvariants.insert(Instr);
6142           continue;
6143         }
6144 
6145         // Overwrite previous end points.
6146         EndPoint[Instr] = IdxToInstr.size();
6147         Ends.insert(Instr);
6148       }
6149     }
6150   }
6151 
6152   // Saves the list of intervals that end with the index in 'key'.
6153   using InstrList = SmallVector<Instruction *, 2>;
6154   DenseMap<unsigned, InstrList> TransposeEnds;
6155 
6156   // Transpose the EndPoints to a list of values that end at each index.
6157   for (auto &Interval : EndPoint)
6158     TransposeEnds[Interval.second].push_back(Interval.first);
6159 
6160   SmallPtrSet<Instruction *, 8> OpenIntervals;
6161   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6162   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6163 
6164   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6165 
6166   // A lambda that gets the register usage for the given type and VF.
6167   const auto &TTICapture = TTI;
6168   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
6169     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6170       return 0U;
6171     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6172   };
6173 
6174   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6175     Instruction *I = IdxToInstr[i];
6176 
6177     // Remove all of the instructions that end at this location.
6178     InstrList &List = TransposeEnds[i];
6179     for (Instruction *ToRemove : List)
6180       OpenIntervals.erase(ToRemove);
6181 
6182     // Ignore instructions that are never used within the loop.
6183     if (!Ends.count(I))
6184       continue;
6185 
6186     // Skip ignored values.
6187     if (ValuesToIgnore.count(I))
6188       continue;
6189 
6190     // For each VF find the maximum usage of registers.
6191     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6192       // Count the number of live intervals.
6193       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6194 
6195       if (VFs[j].isScalar()) {
        for (auto *Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
          RegUsage[ClassID] += 1;
        }
6203       } else {
6204         collectUniformsAndScalars(VFs[j]);
        for (auto *Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
6223       }
6224 
      for (auto &pair : RegUsage) {
        auto &Entry = MaxUsages[j][pair.first];
        Entry = std::max(Entry, pair.second);
      }
6231     }
6232 
6233     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6234                       << OpenIntervals.size() << '\n');
6235 
6236     // Add the current instruction to the list of open intervals.
6237     OpenIntervals.insert(I);
6238   }
6239 
6240   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6241     SmallMapVector<unsigned, unsigned, 4> Invariant;
6242 
    for (auto *Inst : LoopInvariants) {
      unsigned Usage =
          VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      Invariant[ClassID] += Usage;
6252     }
6253 
6254     LLVM_DEBUG({
6255       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6256       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6257              << " item\n";
6258       for (const auto &pair : MaxUsages[i]) {
6259         dbgs() << "LV(REG): RegisterClass: "
6260                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6261                << " registers\n";
6262       }
6263       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6264              << " item\n";
6265       for (const auto &pair : Invariant) {
6266         dbgs() << "LV(REG): RegisterClass: "
6267                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6268                << " registers\n";
6269       }
6270     });
6271 
6272     RU.LoopInvariantRegs = Invariant;
6273     RU.MaxLocalUsers = MaxUsages[i];
6274     RUs[i] = RU;
6275   }
6276 
6277   return RUs;
6278 }
6279 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
6281   // TODO: Cost model for emulated masked load/store is completely
6282   // broken. This hack guides the cost model to use an artificially
6283   // high enough value to practically disable vectorization with such
6284   // operations, except where previously deployed legality hack allowed
6285   // using very low cost values. This is to avoid regressions coming simply
6286   // from moving "masked load/store" check from legality to cost model.
6287   // Masked Load/Gather emulation was previously never allowed.
  // Only a limited number of emulated Masked Store/Scatter operations were
  // allowed.
6289   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
6290   return isa<LoadInst>(I) ||
6291          (isa<StoreInst>(I) &&
6292           NumPredStores > NumberOfStoresToPredicate);
6293 }
6294 
6295 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6296   // If we aren't vectorizing the loop, or if we've already collected the
6297   // instructions to scalarize, there's nothing to do. Collection may already
6298   // have occurred if we have a user-selected VF and are now computing the
6299   // expected cost for interleaving.
6300   if (VF.isScalar() || VF.isZero() ||
6301       InstsToScalarize.find(VF) != InstsToScalarize.end())
6302     return;
6303 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6305   // not profitable to scalarize any instructions, the presence of VF in the
6306   // map will indicate that we've analyzed it already.
6307   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6308 
6309   // Find all the instructions that are scalar with predication in the loop and
6310   // determine if it would be better to not if-convert the blocks they are in.
6311   // If so, we also record the instructions to scalarize.
6312   for (BasicBlock *BB : TheLoop->blocks()) {
6313     if (!blockNeedsPredication(BB))
6314       continue;
6315     for (Instruction &I : *BB)
6316       if (isScalarWithPredication(&I)) {
6317         ScalarCostsTy ScalarCosts;
6318         // Do not apply discount logic if hacked cost is needed
6319         // for emulated masked memrefs.
6320         if (!useEmulatedMaskMemRefHack(&I) &&
6321             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6322           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6323         // Remember that BB will remain after vectorization.
6324         PredicatedBBsAfterVectorization.insert(BB);
6325       }
6326   }
6327 }
6328 
6329 int LoopVectorizationCostModel::computePredInstDiscount(
6330     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
6331     ElementCount VF) {
6332   assert(!isUniformAfterVectorization(PredInst, VF) &&
6333          "Instruction marked uniform-after-vectorization will be predicated");
6334 
6335   // Initialize the discount to zero, meaning that the scalar version and the
6336   // vector version cost the same.
6337   int Discount = 0;
6338 
6339   // Holds instructions to analyze. The instructions we visit are mapped in
6340   // ScalarCosts. Those instructions are the ones that would be scalarized if
6341   // we find that the scalar version costs less.
6342   SmallVector<Instruction *, 8> Worklist;
6343 
6344   // Returns true if the given instruction can be scalarized.
6345   auto canBeScalarized = [&](Instruction *I) -> bool {
6346     // We only attempt to scalarize instructions forming a single-use chain
6347     // from the original predicated block that would otherwise be vectorized.
6348     // Although not strictly necessary, we give up on instructions we know will
6349     // already be scalar to avoid traversing chains that are unlikely to be
6350     // beneficial.
6351     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6352         isScalarAfterVectorization(I, VF))
6353       return false;
6354 
6355     // If the instruction is scalar with predication, it will be analyzed
6356     // separately. We ignore it within the context of PredInst.
6357     if (isScalarWithPredication(I))
6358       return false;
6359 
6360     // If any of the instruction's operands are uniform after vectorization,
6361     // the instruction cannot be scalarized. This prevents, for example, a
6362     // masked load from being scalarized.
6363     //
6364     // We assume we will only emit a value for lane zero of an instruction
6365     // marked uniform after vectorization, rather than VF identical values.
6366     // Thus, if we scalarize an instruction that uses a uniform, we would
6367     // create uses of values corresponding to the lanes we aren't emitting code
6368     // for. This behavior can be changed by allowing getScalarValue to clone
6369     // the lane zero values for uniforms rather than asserting.
6370     for (Use &U : I->operands())
6371       if (auto *J = dyn_cast<Instruction>(U.get()))
6372         if (isUniformAfterVectorization(J, VF))
6373           return false;
6374 
6375     // Otherwise, we can scalarize the instruction.
6376     return true;
6377   };
6378 
6379   // Compute the expected cost discount from scalarizing the entire expression
6380   // feeding the predicated instruction. We currently only consider expressions
6381   // that are single-use instruction chains.
6382   Worklist.push_back(PredInst);
6383   while (!Worklist.empty()) {
6384     Instruction *I = Worklist.pop_back_val();
6385 
6386     // If we've already analyzed the instruction, there's nothing to do.
6387     if (ScalarCosts.find(I) != ScalarCosts.end())
6388       continue;
6389 
6390     // Compute the cost of the vector instruction. Note that this cost already
6391     // includes the scalarization overhead of the predicated instruction.
6392     unsigned VectorCost = getInstructionCost(I, VF).first;
6393 
6394     // Compute the cost of the scalarized instruction. This cost is the cost of
6395     // the instruction as if it wasn't if-converted and instead remained in the
6396     // predicated block. We will scale this cost by block probability after
6397     // computing the scalarization overhead.
6398     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6399     unsigned ScalarCost =
6400         VF.getKnownMinValue() *
6401         getInstructionCost(I, ElementCount::getFixed(1)).first;
6402 
6403     // Compute the scalarization overhead of needed insertelement instructions
6404     // and phi nodes.
6405     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6406       ScalarCost += TTI.getScalarizationOverhead(
6407           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6408           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6409       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6410       ScalarCost +=
6411           VF.getKnownMinValue() *
6412           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6413     }
6414 
6415     // Compute the scalarization overhead of needed extractelement
6416     // instructions. For each of the instruction's operands, if the operand can
6417     // be scalarized, add it to the worklist; otherwise, account for the
6418     // overhead.
6419     for (Use &U : I->operands())
6420       if (auto *J = dyn_cast<Instruction>(U.get())) {
6421         assert(VectorType::isValidElementType(J->getType()) &&
6422                "Instruction has non-scalar type");
6423         if (canBeScalarized(J))
6424           Worklist.push_back(J);
6425         else if (needsExtract(J, VF)) {
6426           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6427           ScalarCost += TTI.getScalarizationOverhead(
6428               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6429               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6430         }
6431       }
6432 
6433     // Scale the total scalar cost by block probability.
6434     ScalarCost /= getReciprocalPredBlockProb();
6435 
6436     // Compute the discount. A non-negative discount means the vector version
6437     // of the instruction costs more, and scalarizing would be beneficial.
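    // E.g. (illustrative): if the vector form of I costs 6 and its scalarized,
    // probability-scaled form costs 4, the discount grows by 2, i.e.
    // scalarizing this chain is expected to save 2 units of cost.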
6438     Discount += VectorCost - ScalarCost;
6439     ScalarCosts[I] = ScalarCost;
6440   }
6441 
6442   return Discount;
6443 }
6444 
6445 LoopVectorizationCostModel::VectorizationCostTy
6446 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6447   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6448   VectorizationCostTy Cost;
6449 
6450   // For each block.
6451   for (BasicBlock *BB : TheLoop->blocks()) {
6452     VectorizationCostTy BlockCost;
6453 
6454     // For each instruction in the old loop.
6455     for (Instruction &I : BB->instructionsWithoutDebug()) {
6456       // Skip ignored values.
6457       if (ValuesToIgnore.count(&I) ||
6458           (VF.isVector() && VecValuesToIgnore.count(&I)))
6459         continue;
6460 
6461       VectorizationCostTy C = getInstructionCost(&I, VF);
6462 
6463       // Check if we should override the cost.
6464       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6465         C.first = ForceTargetInstructionCost;
6466 
6467       BlockCost.first += C.first;
6468       BlockCost.second |= C.second;
6469       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6470                         << " for VF " << VF << " For instruction: " << I
6471                         << '\n');
6472     }
6473 
6474     // If we are vectorizing a predicated block, it will have been
6475     // if-converted. This means that the block's instructions (aside from
6476     // stores and instructions that may divide by zero) will now be
6477     // unconditionally executed. For the scalar case, we may not always execute
6478     // the predicated block. Thus, scale the block's cost by the probability of
6479     // executing it.
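    // For example, assuming the usual reciprocal block probability of 2 (the
    // block is expected to execute on roughly half of the iterations), a
    // predicated block cost of 8 is scaled down to 4 for the scalar VF.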
6480     if (VF.isScalar() && blockNeedsPredication(BB))
6481       BlockCost.first /= getReciprocalPredBlockProb();
6482 
6483     Cost.first += BlockCost.first;
6484     Cost.second |= BlockCost.second;
6485   }
6486 
6487   return Cost;
6488 }
6489 
6490 /// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except for the induction variable dependence.
6492 ///
6493 /// This SCEV can be sent to the Target in order to estimate the address
6494 /// calculation cost.
static const SCEV *getAddressAccessSCEV(Value *Ptr,
                                        LoopVectorizationLegality *Legal,
                                        PredicatedScalarEvolution &PSE,
                                        const Loop *TheLoop) {
6501   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6502   if (!Gep)
6503     return nullptr;
6504 
6505   // We are looking for a gep with all loop invariant indices except for one
6506   // which should be an induction variable.
6507   auto SE = PSE.getSE();
6508   unsigned NumOperands = Gep->getNumOperands();
6509   for (unsigned i = 1; i < NumOperands; ++i) {
6510     Value *Opd = Gep->getOperand(i);
6511     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6512         !Legal->isInductionVariable(Opd))
6513       return nullptr;
6514   }
6515 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6517   return PSE.getSCEV(Ptr);
6518 }
6519 
6520 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6521   return Legal->hasStride(I->getOperand(0)) ||
6522          Legal->hasStride(I->getOperand(1));
6523 }
6524 
6525 unsigned
6526 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6527                                                         ElementCount VF) {
6528   assert(VF.isVector() &&
6529          "Scalarization cost of instruction implies vectorization.");
6530   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6531   Type *ValTy = getMemInstValueType(I);
6532   auto SE = PSE.getSE();
6533 
6534   unsigned AS = getLoadStoreAddressSpace(I);
6535   Value *Ptr = getLoadStorePointerOperand(I);
6536   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6537 
6538   // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6540   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6541 
6542   // Get the cost of the scalar memory instruction and address computation.
6543   unsigned Cost =
6544       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6545 
6546   // Don't pass *I here, since it is scalar but will actually be part of a
6547   // vectorized loop where the user of it is a vectorized instruction.
6548   const Align Alignment = getLoadStoreAlignment(I);
6549   Cost += VF.getKnownMinValue() *
6550           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6551                               AS, TTI::TCK_RecipThroughput);
6552 
6553   // Get the overhead of the extractelement and insertelement instructions
6554   // we might create due to scalarization.
6555   Cost += getScalarizationOverhead(I, VF);
6556 
6557   // If we have a predicated store, it may not be executed for each vector
6558   // lane. Scale the cost by the probability of executing the predicated
6559   // block.
6560   if (isPredicatedInst(I)) {
6561     Cost /= getReciprocalPredBlockProb();
6562 
6563     if (useEmulatedMaskMemRefHack(I))
6564       // Artificially setting to a high enough value to practically disable
6565       // vectorization with such operations.
6566       Cost = 3000000;
6567   }
6568 
6569   return Cost;
6570 }
6571 
6572 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6573                                                              ElementCount VF) {
6574   Type *ValTy = getMemInstValueType(I);
6575   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6576   Value *Ptr = getLoadStorePointerOperand(I);
6577   unsigned AS = getLoadStoreAddressSpace(I);
6578   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6579   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6580 
6581   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6582          "Stride should be 1 or -1 for consecutive memory access");
6583   const Align Alignment = getLoadStoreAlignment(I);
6584   unsigned Cost = 0;
6585   if (Legal->isMaskRequired(I))
6586     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6587                                       CostKind);
6588   else
6589     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6590                                 CostKind, I);
6591 
6592   bool Reverse = ConsecutiveStride < 0;
6593   if (Reverse)
6594     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6595   return Cost;
6596 }
6597 
6598 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6599                                                          ElementCount VF) {
6600   assert(Legal->isUniformMemOp(*I));
6601 
6602   Type *ValTy = getMemInstValueType(I);
6603   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6604   const Align Alignment = getLoadStoreAlignment(I);
6605   unsigned AS = getLoadStoreAddressSpace(I);
6606   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6607   if (isa<LoadInst>(I)) {
6608     return TTI.getAddressComputationCost(ValTy) +
6609            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6610                                CostKind) +
6611            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6612   }
6613   StoreInst *SI = cast<StoreInst>(I);
6614 
6615   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6616   return TTI.getAddressComputationCost(ValTy) +
6617          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6618                              CostKind) +
6619          (isLoopInvariantStoreValue
6620               ? 0
6621               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6622                                        VF.getKnownMinValue() - 1));
6623 }
6624 
6625 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6626                                                           ElementCount VF) {
6627   Type *ValTy = getMemInstValueType(I);
6628   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6629   const Align Alignment = getLoadStoreAlignment(I);
6630   const Value *Ptr = getLoadStorePointerOperand(I);
6631 
6632   return TTI.getAddressComputationCost(VectorTy) +
6633          TTI.getGatherScatterOpCost(
6634              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6635              TargetTransformInfo::TCK_RecipThroughput, I);
6636 }
6637 
6638 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6639                                                             ElementCount VF) {
6640   Type *ValTy = getMemInstValueType(I);
6641   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6642   unsigned AS = getLoadStoreAddressSpace(I);
6643 
6644   auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Failed to get an interleaved access group.");
6646 
6647   unsigned InterleaveFactor = Group->getFactor();
6648   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6649   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6650 
6651   // Holds the indices of existing members in an interleaved load group.
6652   // An interleaved store group doesn't need this as it doesn't allow gaps.
6653   SmallVector<unsigned, 4> Indices;
6654   if (isa<LoadInst>(I)) {
6655     for (unsigned i = 0; i < InterleaveFactor; i++)
6656       if (Group->getMember(i))
6657         Indices.push_back(i);
6658   }
6659 
6660   // Calculate the cost of the whole interleaved group.
6661   bool UseMaskForGaps =
6662       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6663   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6664       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6665       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6666 
6667   if (Group->isReverse()) {
6668     // TODO: Add support for reversed masked interleaved access.
6669     assert(!Legal->isMaskRequired(I) &&
6670            "Reverse masked interleaved access not supported.");
6671     Cost += Group->getNumMembers() *
6672             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6673   }
6674   return Cost;
6675 }
6676 
6677 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6678                                                               ElementCount VF) {
  // Calculate scalar cost only. The vectorization cost should already be
  // available at this point.
6681   if (VF.isScalar()) {
6682     Type *ValTy = getMemInstValueType(I);
6683     const Align Alignment = getLoadStoreAlignment(I);
6684     unsigned AS = getLoadStoreAddressSpace(I);
6685 
6686     return TTI.getAddressComputationCost(ValTy) +
6687            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6688                                TTI::TCK_RecipThroughput, I);
6689   }
6690   return getWideningCost(I, VF);
6691 }
6692 
6693 LoopVectorizationCostModel::VectorizationCostTy
6694 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6695                                                ElementCount VF) {
6696   assert(!VF.isScalable() &&
6697          "the cost model is not yet implemented for scalable vectorization");
6698   // If we know that this instruction will remain uniform, check the cost of
6699   // the scalar version.
6700   if (isUniformAfterVectorization(I, VF))
6701     VF = ElementCount::getFixed(1);
6702 
6703   if (VF.isVector() && isProfitableToScalarize(I, VF))
6704     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6705 
6706   // Forced scalars do not have any scalarization overhead.
6707   auto ForcedScalar = ForcedScalars.find(VF);
6708   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6709     auto InstSet = ForcedScalar->second;
6710     if (InstSet.count(I))
6711       return VectorizationCostTy(
6712           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6713            VF.getKnownMinValue()),
6714           false);
6715   }
6716 
6717   Type *VectorTy;
6718   unsigned C = getInstructionCost(I, VF, VectorTy);
6719 
6720   bool TypeNotScalarized =
6721       VF.isVector() && VectorTy->isVectorTy() &&
6722       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6723   return VectorizationCostTy(C, TypeNotScalarized);
6724 }
6725 
6726 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6727                                                               ElementCount VF) {
6728 
6729   assert(!VF.isScalable() &&
6730          "cannot compute scalarization overhead for scalable vectorization");
6731   if (VF.isScalar())
6732     return 0;
6733 
6734   unsigned Cost = 0;
6735   Type *RetTy = ToVectorTy(I->getType(), VF);
6736   if (!RetTy->isVoidTy() &&
6737       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6738     Cost += TTI.getScalarizationOverhead(
6739         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6740         true, false);
6741 
6742   // Some targets keep addresses scalar.
6743   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6744     return Cost;
6745 
6746   // Some targets support efficient element stores.
6747   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6748     return Cost;
6749 
6750   // Collect operands to consider.
6751   CallInst *CI = dyn_cast<CallInst>(I);
6752   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6753 
6754   // Skip operands that do not require extraction/scalarization and do not incur
6755   // any overhead.
6756   return Cost + TTI.getOperandsScalarizationOverhead(
6757                     filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6758 }
6759 
6760 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6761   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6762   if (VF.isScalar())
6763     return;
6764   NumPredStores = 0;
6765   for (BasicBlock *BB : TheLoop->blocks()) {
6766     // For each instruction in the old loop.
6767     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6769       if (!Ptr)
6770         continue;
6771 
6772       // TODO: We should generate better code and update the cost model for
6773       // predicated uniform stores. Today they are treated as any other
6774       // predicated store (see added test cases in
6775       // invariant-store-vectorization.ll).
6776       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6777         NumPredStores++;
6778 
6779       if (Legal->isUniformMemOp(I)) {
6780         // TODO: Avoid replicating loads and stores instead of
6781         // relying on instcombine to remove them.
6782         // Load: Scalar load + broadcast
6783         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6784         unsigned Cost = getUniformMemOpCost(&I, VF);
6785         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6786         continue;
6787       }
6788 
6789       // We assume that widening is the best solution when possible.
6790       if (memoryInstructionCanBeWidened(&I, VF)) {
6791         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6792         int ConsecutiveStride =
6793                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6794         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6795                "Expected consecutive stride.");
6796         InstWidening Decision =
6797             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6798         setWideningDecision(&I, VF, Decision, Cost);
6799         continue;
6800       }
6801 
6802       // Choose between Interleaving, Gather/Scatter or Scalarization.
6803       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6804       unsigned NumAccesses = 1;
6805       if (isAccessInterleaved(&I)) {
6806         auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Failed to get an interleaved access group.");
6808 
6809         // Make one decision for the whole group.
6810         if (getWideningDecision(&I, VF) != CM_Unknown)
6811           continue;
6812 
6813         NumAccesses = Group->getNumMembers();
6814         if (interleavedAccessCanBeWidened(&I, VF))
6815           InterleaveCost = getInterleaveGroupCost(&I, VF);
6816       }
6817 
6818       unsigned GatherScatterCost =
6819           isLegalGatherOrScatter(&I)
6820               ? getGatherScatterCost(&I, VF) * NumAccesses
6821               : std::numeric_limits<unsigned>::max();
6822 
6823       unsigned ScalarizationCost =
6824           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6825 
6826       // Choose better solution for the current VF,
6827       // write down this decision and use it during vectorization.
6828       unsigned Cost;
6829       InstWidening Decision;
6830       if (InterleaveCost <= GatherScatterCost &&
6831           InterleaveCost < ScalarizationCost) {
6832         Decision = CM_Interleave;
6833         Cost = InterleaveCost;
6834       } else if (GatherScatterCost < ScalarizationCost) {
6835         Decision = CM_GatherScatter;
6836         Cost = GatherScatterCost;
6837       } else {
6838         Decision = CM_Scalarize;
6839         Cost = ScalarizationCost;
6840       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The cost is computed for the whole group,
      // but it will actually be assigned to a single member instruction.
6844       if (auto Group = getInterleavedAccessGroup(&I))
6845         setWideningDecision(Group, VF, Decision, Cost);
6846       else
6847         setWideningDecision(&I, VF, Decision, Cost);
6848     }
6849   }
6850 
6851   // Make sure that any load of address and any other address computation
6852   // remains scalar unless there is gather/scatter support. This avoids
6853   // inevitable extracts into address registers, and also has the benefit of
6854   // activating LSR more, since that pass can't optimize vectorized
6855   // addresses.
6856   if (TTI.prefersVectorizedAddressing())
6857     return;
6858 
6859   // Start with all scalar pointer uses.
6860   SmallPtrSet<Instruction *, 8> AddrDefs;
6861   for (BasicBlock *BB : TheLoop->blocks())
6862     for (Instruction &I : *BB) {
6863       Instruction *PtrDef =
6864         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6865       if (PtrDef && TheLoop->contains(PtrDef) &&
6866           getWideningDecision(&I, VF) != CM_GatherScatter)
6867         AddrDefs.insert(PtrDef);
6868     }
6869 
6870   // Add all instructions used to generate the addresses.
6871   SmallVector<Instruction *, 4> Worklist;
6872   for (auto *I : AddrDefs)
6873     Worklist.push_back(I);
6874   while (!Worklist.empty()) {
6875     Instruction *I = Worklist.pop_back_val();
6876     for (auto &Op : I->operands())
6877       if (auto *InstOp = dyn_cast<Instruction>(Op))
6878         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6879             AddrDefs.insert(InstOp).second)
6880           Worklist.push_back(InstOp);
6881   }
6882 
6883   for (auto *I : AddrDefs) {
6884     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this requires finding out whether the
      // loaded value is involved in an address computation, it is instead
      // changed here when we know this is the case.
6889       InstWidening Decision = getWideningDecision(I, VF);
6890       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6891         // Scalarize a widened load of address.
6892         setWideningDecision(
6893             I, VF, CM_Scalarize,
6894             (VF.getKnownMinValue() *
6895              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6896       else if (auto Group = getInterleavedAccessGroup(I)) {
6897         // Scalarize an interleave group of address loads.
6898         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6899           if (Instruction *Member = Group->getMember(I))
6900             setWideningDecision(
6901                 Member, VF, CM_Scalarize,
6902                 (VF.getKnownMinValue() *
6903                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6904         }
6905       }
6906     } else
      // Make sure I gets scalarized and is given a cost estimate without
      // scalarization overhead.
6909       ForcedScalars[VF].insert(I);
6910   }
6911 }
6912 
6913 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6914                                                         ElementCount VF,
6915                                                         Type *&VectorTy) {
6916   Type *RetTy = I->getType();
6917   if (canTruncateToMinimalBitwidth(I, VF))
6918     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6919   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6920   auto SE = PSE.getSE();
6921   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6922 
6923   // TODO: We need to estimate the cost of intrinsic calls.
6924   switch (I->getOpcode()) {
6925   case Instruction::GetElementPtr:
6926     // We mark this instruction as zero-cost because the cost of GEPs in
6927     // vectorized code depends on whether the corresponding memory instruction
6928     // is scalarized or not. Therefore, we handle GEPs with the memory
6929     // instruction cost.
6930     return 0;
6931   case Instruction::Br: {
6932     // In cases of scalarized and predicated instructions, there will be VF
6933     // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
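    // For instance (illustrative), at VF = 4 this charges the extraction of
    // four i1 mask elements plus four scalar branch costs.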
6935     bool ScalarPredicatedBB = false;
6936     BranchInst *BI = cast<BranchInst>(I);
6937     if (VF.isVector() && BI->isConditional() &&
6938         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6939          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6940       ScalarPredicatedBB = true;
6941 
6942     if (ScalarPredicatedBB) {
6943       // Return cost for branches around scalarized and predicated blocks.
6944       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6945       auto *Vec_i1Ty =
6946           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6947       return (TTI.getScalarizationOverhead(
6948                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
6949                   false, true) +
6950               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
6951                VF.getKnownMinValue()));
6952     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6953       // The back-edge branch will remain, as will all scalar branches.
6954       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6955     else
6956       // This branch will be eliminated by if-conversion.
6957       return 0;
6958     // Note: We currently assume zero cost for an unconditional branch inside
6959     // a predicated block since it will become a fall-through, although we
6960     // may decide in the future to call TTI for all branches.
6961   }
6962   case Instruction::PHI: {
6963     auto *Phi = cast<PHINode>(I);
6964 
6965     // First-order recurrences are replaced by vector shuffles inside the loop.
6966     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6967     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
6968       return TTI.getShuffleCost(
6969           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
6970           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
6971 
6972     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6973     // converted into select instructions. We require N - 1 selects per phi
6974     // node, where N is the number of incoming values.
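    // For example, a phi merging three incoming values is lowered to two
    // vector selects, so we charge twice the select cost.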
6975     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6976       return (Phi->getNumIncomingValues() - 1) *
6977              TTI.getCmpSelInstrCost(
6978                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6979                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6980                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6981 
6982     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6983   }
6984   case Instruction::UDiv:
6985   case Instruction::SDiv:
6986   case Instruction::URem:
6987   case Instruction::SRem:
6988     // If we have a predicated instruction, it may not be executed for each
6989     // vector lane. Get the scalarization cost and scale this amount by the
6990     // probability of executing the predicated block. If the instruction is not
6991     // predicated, we fall through to the next case.
6992     if (VF.isVector() && isScalarWithPredication(I)) {
6993       unsigned Cost = 0;
6994 
6995       // These instructions have a non-void type, so account for the phi nodes
6996       // that we will create. This cost is likely to be zero. The phi node
6997       // cost, if any, should be scaled by the block probability because it
6998       // models a copy at the end of each predicated block.
6999       Cost += VF.getKnownMinValue() *
7000               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7001 
7002       // The cost of the non-predicated instruction.
7003       Cost += VF.getKnownMinValue() *
7004               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7005 
7006       // The cost of insertelement and extractelement instructions needed for
7007       // scalarization.
7008       Cost += getScalarizationOverhead(I, VF);
7009 
7010       // Scale the cost by the probability of executing the predicated blocks.
7011       // This assumes the predicated block for each vector lane is equally
7012       // likely.
7013       return Cost / getReciprocalPredBlockProb();
7014     }
7015     LLVM_FALLTHROUGH;
7016   case Instruction::Add:
7017   case Instruction::FAdd:
7018   case Instruction::Sub:
7019   case Instruction::FSub:
7020   case Instruction::Mul:
7021   case Instruction::FMul:
7022   case Instruction::FDiv:
7023   case Instruction::FRem:
7024   case Instruction::Shl:
7025   case Instruction::LShr:
7026   case Instruction::AShr:
7027   case Instruction::And:
7028   case Instruction::Or:
7029   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
7031     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7032       return 0;
7033     // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
7035     Value *Op2 = I->getOperand(1);
7036     TargetTransformInfo::OperandValueProperties Op2VP;
7037     TargetTransformInfo::OperandValueKind Op2VK =
7038         TTI.getOperandInfo(Op2, Op2VP);
7039     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7040       Op2VK = TargetTransformInfo::OK_UniformValue;
7041 
7042     SmallVector<const Value *, 4> Operands(I->operand_values());
7043     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7044     return N * TTI.getArithmeticInstrCost(
7045                    I->getOpcode(), VectorTy, CostKind,
7046                    TargetTransformInfo::OK_AnyValue,
7047                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7048   }
7049   case Instruction::FNeg: {
7050     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7051     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7052     return N * TTI.getArithmeticInstrCost(
7053                    I->getOpcode(), VectorTy, CostKind,
7054                    TargetTransformInfo::OK_AnyValue,
7055                    TargetTransformInfo::OK_AnyValue,
7056                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
7057                    I->getOperand(0), I);
7058   }
7059   case Instruction::Select: {
7060     SelectInst *SI = cast<SelectInst>(I);
7061     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7062     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7063     Type *CondTy = SI->getCondition()->getType();
7064     if (!ScalarCond) {
7065       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7066       CondTy = VectorType::get(CondTy, VF);
7067     }
7068     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7069                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7070   }
7071   case Instruction::ICmp:
7072   case Instruction::FCmp: {
7073     Type *ValTy = I->getOperand(0)->getType();
7074     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7075     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7076       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7077     VectorTy = ToVectorTy(ValTy, VF);
7078     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7079                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7080   }
7081   case Instruction::Store:
7082   case Instruction::Load: {
7083     ElementCount Width = VF;
7084     if (Width.isVector()) {
7085       InstWidening Decision = getWideningDecision(I, Width);
7086       assert(Decision != CM_Unknown &&
7087              "CM decision should be taken at this point");
7088       if (Decision == CM_Scalarize)
7089         Width = ElementCount::getFixed(1);
7090     }
7091     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
7092     return getMemoryInstructionCost(I, VF);
7093   }
7094   case Instruction::ZExt:
7095   case Instruction::SExt:
7096   case Instruction::FPToUI:
7097   case Instruction::FPToSI:
7098   case Instruction::FPExt:
7099   case Instruction::PtrToInt:
7100   case Instruction::IntToPtr:
7101   case Instruction::SIToFP:
7102   case Instruction::UIToFP:
7103   case Instruction::Trunc:
7104   case Instruction::FPTrunc:
7105   case Instruction::BitCast: {
7106     // Computes the CastContextHint from a Load/Store instruction.
7107     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7108       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7109              "Expected a load or a store!");
7110 
7111       if (VF.isScalar() || !TheLoop->contains(I))
7112         return TTI::CastContextHint::Normal;
7113 
7114       switch (getWideningDecision(I, VF)) {
7115       case LoopVectorizationCostModel::CM_GatherScatter:
7116         return TTI::CastContextHint::GatherScatter;
7117       case LoopVectorizationCostModel::CM_Interleave:
7118         return TTI::CastContextHint::Interleave;
7119       case LoopVectorizationCostModel::CM_Scalarize:
7120       case LoopVectorizationCostModel::CM_Widen:
7121         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7122                                         : TTI::CastContextHint::Normal;
7123       case LoopVectorizationCostModel::CM_Widen_Reverse:
7124         return TTI::CastContextHint::Reversed;
7125       case LoopVectorizationCostModel::CM_Unknown:
7126         llvm_unreachable("Instr did not go through cost modelling?");
7127       }
7128 
7129       llvm_unreachable("Unhandled case!");
7130     };
7131 
7132     unsigned Opcode = I->getOpcode();
7133     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7134     // For Trunc, the context is the only user, which must be a StoreInst.
7135     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7136       if (I->hasOneUse())
7137         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7138           CCH = ComputeCCH(Store);
7139     }
7140     // For Z/Sext, the context is the operand, which must be a LoadInst.
7141     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7142              Opcode == Instruction::FPExt) {
7143       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7144         CCH = ComputeCCH(Load);
7145     }
7146 
7147     // We optimize the truncation of induction variables having constant
7148     // integer steps. The cost of these truncations is the same as the scalar
7149     // operation.
7150     if (isOptimizableIVTruncate(I, VF)) {
7151       auto *Trunc = cast<TruncInst>(I);
7152       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7153                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7154     }
7155 
7156     Type *SrcScalarTy = I->getOperand(0)->getType();
7157     Type *SrcVecTy =
7158         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7159     if (canTruncateToMinimalBitwidth(I, VF)) {
7160       // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
7162       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7163       //
7164       // Calculate the modified src and dest types.
7165       Type *MinVecTy = VectorTy;
7166       if (Opcode == Instruction::Trunc) {
7167         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7168         VectorTy =
7169             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7170       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7171         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7172         VectorTy =
7173             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7174       }
7175     }
7176 
7177     assert(!VF.isScalable() && "VF is assumed to be non scalable");
7178     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7179     return N *
7180            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7181   }
7182   case Instruction::Call: {
7183     bool NeedToScalarize;
7184     CallInst *CI = cast<CallInst>(I);
7185     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7186     if (getVectorIntrinsicIDForCall(CI, TLI))
7187       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
7188     return CallCost;
7189   }
7190   case Instruction::ExtractValue:
7191     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7192   default:
7193     // The cost of executing VF copies of the scalar instruction. This opcode
7194     // is unknown. Assume that it is the same as 'mul'.
7195     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
7196                                        Instruction::Mul, VectorTy, CostKind) +
7197            getScalarizationOverhead(I, VF);
7198   } // end of switch.
7199 }
7200 
7201 char LoopVectorize::ID = 0;
7202 
7203 static const char lv_name[] = "Loop Vectorization";
7204 
7205 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7206 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7207 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7208 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7209 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7210 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7211 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7212 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7213 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7214 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7215 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7216 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7217 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7218 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7219 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7220 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7221 
7222 namespace llvm {
7223 
7224 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7225 
7226 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7227                               bool VectorizeOnlyWhenForced) {
7228   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7229 }
7230 
7231 } // end namespace llvm
7232 
7233 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7234   // Check if the pointer operand of a load or store instruction is
7235   // consecutive.
7236   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7237     return Legal->isConsecutivePtr(Ptr);
7238   return false;
7239 }
7240 
7241 void LoopVectorizationCostModel::collectValuesToIgnore() {
7242   // Ignore ephemeral values.
7243   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7244 
7245   // Ignore type-promoting instructions we identified during reduction
7246   // detection.
7247   for (auto &Reduction : Legal->getReductionVars()) {
7248     RecurrenceDescriptor &RedDes = Reduction.second;
7249     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7250     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7251   }
7252   // Ignore type-casting instructions we identified during induction
7253   // detection.
7254   for (auto &Induction : Legal->getInductionVars()) {
7255     InductionDescriptor &IndDes = Induction.second;
7256     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7257     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7258   }
7259 }
7260 
7261 void LoopVectorizationCostModel::collectInLoopReductions() {
7262   for (auto &Reduction : Legal->getReductionVars()) {
7263     PHINode *Phi = Reduction.first;
7264     RecurrenceDescriptor &RdxDesc = Reduction.second;
7265 
7266     // We don't collect reductions that are type promoted (yet).
7267     if (RdxDesc.getRecurrenceType() != Phi->getType())
7268       continue;
7269 
7270     // If the target would prefer this reduction to happen "in-loop", then we
7271     // want to record it as such.
7272     unsigned Opcode = RdxDesc.getRecurrenceBinOp();
7273     if (!PreferInLoopReductions &&
7274         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7275                                    TargetTransformInfo::ReductionFlags()))
7276       continue;
7277 
7278     // Check that we can correctly put the reductions into the loop, by
7279     // finding the chain of operations that leads from the phi to the loop
7280     // exit value.
7281     SmallVector<Instruction *, 4> ReductionOperations =
7282         RdxDesc.getReductionOpChain(Phi, TheLoop);
7283     bool InLoop = !ReductionOperations.empty();
7284     if (InLoop)
7285       InLoopReductionChains[Phi] = ReductionOperations;
7286     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7287                       << " reduction for phi: " << *Phi << "\n");
7288   }
7289 }
7290 
7291 // TODO: we could return a pair of values that specify the max VF and
7292 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7293 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7294 // doesn't have a cost model that can choose which plan to execute if
7295 // more than one is generated.
7296 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7297                                  LoopVectorizationCostModel &CM) {
7298   unsigned WidestType;
7299   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
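  // For example, with 256-bit wide vector registers and a widest scalar type
  // of 32 bits this yields a VF of 8.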
7300   return WidestVectorRegBits / WidestType;
7301 }
7302 
7303 VectorizationFactor
7304 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7305   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7306   ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before even evaluating whether vectorization is profitable.
7309   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7310   // the vectorization pipeline.
7311   if (!OrigLoop->isInnermost()) {
7312     // If the user doesn't provide a vectorization factor, determine a
7313     // reasonable one.
7314     if (UserVF.isZero()) {
7315       VF = ElementCount::getFixed(
7316           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7317       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7318 
7319       // Make sure we have a VF > 1 for stress testing.
7320       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7321         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7322                           << "overriding computed VF.\n");
7323         VF = ElementCount::getFixed(4);
7324       }
7325     }
7326     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7327     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7328            "VF needs to be a power of two");
7329     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7330                       << "VF " << VF << " to build VPlans.\n");
7331     buildVPlans(VF, VF);
7332 
7333     // For VPlan build stress testing, we bail out after VPlan construction.
7334     if (VPlanBuildStressTest)
7335       return VectorizationFactor::Disabled();
7336 
7337     return {VF, 0 /*Cost*/};
7338   }
7339 
7340   LLVM_DEBUG(
7341       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7342                 "VPlan-native path.\n");
7343   return VectorizationFactor::Disabled();
7344 }
7345 
7346 Optional<VectorizationFactor>
7347 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7348   assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
7349   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7350   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
7352     return None;
7353 
  // Invalidate interleave groups if all blocks of the loop will be predicated.
7355   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
7356       !useMaskedInterleavedAccesses(*TTI)) {
7357     LLVM_DEBUG(
7358         dbgs()
7359         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7360            "which requires masked-interleaved support.\n");
7361     if (CM.InterleaveInfo.invalidateGroups())
7362       // Invalidating interleave groups also requires invalidating all decisions
7363       // based on them, which includes widening decisions and uniform and scalar
7364       // values.
7365       CM.invalidateCostModelingDecisions();
7366   }
7367 
7368   ElementCount MaxVF = MaybeMaxVF.getValue();
7369   assert(MaxVF.isNonZero() && "MaxVF is zero.");
7370 
7371   if (!UserVF.isZero() && UserVF.getFixedValue() <= MaxVF.getFixedValue()) {
7372     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7373     assert(isPowerOf2_32(UserVF.getFixedValue()) &&
7374            "VF needs to be a power of two");
7375     // Collect the instructions (and their associated costs) that will be more
7376     // profitable to scalarize.
7377     CM.selectUserVectorizationFactor(UserVF);
7378     CM.collectInLoopReductions();
7379     buildVPlansWithVPRecipes(UserVF, UserVF);
7380     LLVM_DEBUG(printPlans(dbgs()));
7381     return {{UserVF, 0}};
7382   }
7383 
7384   for (ElementCount VF = ElementCount::getFixed(1);
7385        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
7386     // Collect Uniform and Scalar instructions after vectorization with VF.
7387     CM.collectUniformsAndScalars(VF);
7388 
7389     // Collect the instructions (and their associated costs) that will be more
7390     // profitable to scalarize.
7391     if (VF.isVector())
7392       CM.collectInstsToScalarize(VF);
7393   }
7394 
7395   CM.collectInLoopReductions();
7396 
7397   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
7398   LLVM_DEBUG(printPlans(dbgs()));
7399   if (MaxVF.isScalar())
7400     return VectorizationFactor::Disabled();
7401 
7402   // Select the optimal vectorization factor.
7403   return CM.selectVectorizationFactor(MaxVF);
7404 }
7405 
7406 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7407   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7408                     << '\n');
7409   BestVF = VF;
7410   BestUF = UF;
7411 
7412   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7413     return !Plan->hasVF(VF);
7414   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7416 }
7417 
7418 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7419                                            DominatorTree *DT) {
7420   // Perform the actual loop transformation.
7421 
7422   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7423   VPCallbackILV CallbackILV(ILV);
7424 
7425   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7426 
7427   VPTransformState State{*BestVF, BestUF,      LI,
7428                          DT,      ILV.Builder, ILV.VectorLoopValueMap,
7429                          &ILV,    CallbackILV};
7430   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7431   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7432   State.CanonicalIV = ILV.Induction;
7433 
7434   ILV.printDebugTracesAtStart();
7435 
7436   //===------------------------------------------------===//
7437   //
  // Notice: any optimization or new instruction that goes
7439   // into the code below should also be implemented in
7440   // the cost-model.
7441   //
7442   //===------------------------------------------------===//
7443 
7444   // 2. Copy and widen instructions from the old loop into the new loop.
7445   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7446   VPlans.front()->execute(&State);
7447 
7448   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7449   //    predication, updating analyses.
7450   ILV.fixVectorizedLoop();
7451 
7452   ILV.printDebugTracesAtEnd();
7453 }
7454 
7455 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7456     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7457   BasicBlock *Latch = OrigLoop->getLoopLatch();
7458 
7459   // We create new control-flow for the vectorized loop, so the original
7460   // condition will be dead after vectorization if it's only used by the
7461   // branch.
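  // For example (illustrative IR), a latch terminated by
  //   %cmp = icmp eq i64 %iv.next, %n
  //   br i1 %cmp, label %exit, label %loop
  // leaves %cmp dead once the new vector latch condition is created.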
7462   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
7463   if (Cmp && Cmp->hasOneUse()) {
7464     DeadInstructions.insert(Cmp);
7465 
    // The operands of the icmp are often dead truncs, used by IndUpdate.
7467     for (Value *Op : Cmp->operands()) {
7468       if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
7470     }
7471   }
7472 
7473   // We create new "steps" for induction variable updates to which the original
7474   // induction variables map. An original update instruction will be dead if
7475   // all its users except the induction variable are dead.
7476   for (auto &Induction : Legal->getInductionVars()) {
7477     PHINode *Ind = Induction.first;
7478     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7479 
7480     // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
7482     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7483       continue;
7484 
7485     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7486           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7487         }))
7488       DeadInstructions.insert(IndUpdate);
7489 
    // We also record as "Dead" the type-casting instructions we identified
7491     // during induction analysis. We don't need any handling for them in the
7492     // vectorized loop because we have proven that, under a proper runtime
7493     // test guarding the vectorized loop, the value of the phi, and the casted
7494     // value of the phi, are the same. The last instruction in this casting chain
7495     // will get its scalar/vector/widened def from the scalar/vector/widened def
7496     // of the respective phi node. Any other casts in the induction def-use chain
7497     // have no other uses outside the phi update chain, and will be ignored.
7498     InductionDescriptor &IndDes = Induction.second;
7499     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7500     DeadInstructions.insert(Casts.begin(), Casts.end());
7501   }
7502 }
7503 
7504 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7505 
7506 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7507 
7508 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7509                                         Instruction::BinaryOps BinOp) {
7510   // When unrolling and the VF is 1, we only need to add a simple scalar.
7511   Type *Ty = Val->getType();
7512   assert(!Ty->isVectorTy() && "Val must be a scalar");
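  // For example (illustrative), with StartIdx == 2 this produces
  // Val + 2 * Step in the integer case below.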
7513 
7514   if (Ty->isFloatingPointTy()) {
7515     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7516 
7517     // Floating point operations had to be 'fast' to enable the unrolling.
7518     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7519     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7520   }
7521   Constant *C = ConstantInt::get(Ty, StartIdx);
7522   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7523 }
7524 
7525 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7526   SmallVector<Metadata *, 4> MDs;
7527   // Reserve first location for self reference to the LoopID metadata node.
7528   MDs.push_back(nullptr);
7529   bool IsUnrollMetadata = false;
7530   MDNode *LoopID = L->getLoopID();
7531   if (LoopID) {
7532     // First find existing loop unrolling disable metadata.
7533     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7534       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7535       if (MD) {
7536         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7537         IsUnrollMetadata =
7538             S && S->getString().startswith("llvm.loop.unroll.disable");
7539       }
7540       MDs.push_back(LoopID->getOperand(i));
7541     }
7542   }
7543 
7544   if (!IsUnrollMetadata) {
7545     // Add runtime unroll disable metadata.
7546     LLVMContext &Context = L->getHeader()->getContext();
7547     SmallVector<Metadata *, 1> DisableOperands;
7548     DisableOperands.push_back(
7549         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7550     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7551     MDs.push_back(DisableNode);
7552     MDNode *NewLoopID = MDNode::get(Context, MDs);
7553     // Set operand 0 to refer to the loop id itself.
7554     NewLoopID->replaceOperandWith(0, NewLoopID);
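    // The resulting LoopID looks roughly like (illustrative):
    //   !0 = distinct !{!0, <original operands...>,
    //                   !{!"llvm.loop.unroll.runtime.disable"}}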
7555     L->setLoopID(NewLoopID);
7556   }
7557 }
7558 
7559 //===--------------------------------------------------------------------===//
7560 // EpilogueVectorizerMainLoop
7561 //===--------------------------------------------------------------------===//
7562 
7563 /// This function is partially responsible for generating the control flow
7564 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7565 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7566   MDNode *OrigLoopID = OrigLoop->getLoopID();
7567   Loop *Lp = createVectorLoopSkeleton("");
7568 
7569   // Generate the code to check the minimum iteration count of the vector
7570   // epilogue (see below).
7571   EPI.EpilogueIterationCountCheck =
7572       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
7573   EPI.EpilogueIterationCountCheck->setName("iter.check");
7574 
7575   // Generate the code to check any assumptions that we've made for SCEV
7576   // expressions.
7577   BasicBlock *SavedPreHeader = LoopVectorPreHeader;
7578   emitSCEVChecks(Lp, LoopScalarPreHeader);
7579 
  // If a safety check was generated, save it.
7581   if (SavedPreHeader != LoopVectorPreHeader)
7582     EPI.SCEVSafetyCheck = SavedPreHeader;
7583 
7584   // Generate the code that checks at runtime if arrays overlap. We put the
7585   // checks into a separate block to make the more common case of few elements
7586   // faster.
7587   SavedPreHeader = LoopVectorPreHeader;
7588   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
7589 
  // If a safety check was generated, save/overwrite it.
7591   if (SavedPreHeader != LoopVectorPreHeader)
7592     EPI.MemSafetyCheck = SavedPreHeader;
7593 
7594   // Generate the iteration count check for the main loop, *after* the check
7595   // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer path-length for
  // the main loop is compensated for by the gain from vectorizing the larger
7598   // trip count. Note: the branch will get updated later on when we vectorize
7599   // the epilogue.
7600   EPI.MainLoopIterationCountCheck =
7601       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
7602 
7603   // Generate the induction variable.
7604   OldInduction = Legal->getPrimaryInduction();
7605   Type *IdxTy = Legal->getWidestInductionType();
7606   Value *StartIdx = ConstantInt::get(IdxTy, 0);
7607   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7608   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7609   EPI.VectorTripCount = CountRoundDown;
7610   Induction =
7611       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7612                               getDebugLocFromInstOrOperands(OldInduction));
7613 
7614   // Skip induction resume value creation here because they will be created in
7615   // the second pass. If we created them here, they wouldn't be used anyway,
7616   // because the vplan in the second pass still contains the inductions from the
7617   // original loop.
7618 
7619   return completeLoopSkeleton(Lp, OrigLoopID);
7620 }
7621 
7622 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7623   LLVM_DEBUG({
7624     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7625            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7626            << ", Main Loop UF:" << EPI.MainLoopUF
7627            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7628            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7629   });
7630 }
7631 
7632 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7633   DEBUG_WITH_TYPE(VerboseDebug, {
7634     dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
7635   });
7636 }
7637 
7638 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
7639     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
7640   assert(L && "Expected valid Loop.");
7641   assert(Bypass && "Expected valid bypass basic block.");
7642   unsigned VFactor =
7643       ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
7644   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7645   Value *Count = getOrCreateTripCount(L);
7646   // Reuse existing vector loop preheader for TC checks.
7647   // Note that new preheader block is generated for vector loop.
7648   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7649   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7650 
7651   // Generate code to check if the loop's trip count is less than VF * UF of the
7652   // main vector loop.
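  // For example (illustrative), with VF = 4 and UF = 2 this emits
  //   %min.iters.check = icmp ult i64 %count, 8
  // (ule instead of ult when a scalar epilogue is required), branching to the
  // bypass block when the check is true.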
7653   auto P =
7654       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7655 
7656   Value *CheckMinIters = Builder.CreateICmp(
7657       P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
7658       "min.iters.check");
7659 
7660   if (!ForEpilogue)
7661     TCCheckBlock->setName("vector.main.loop.iter.check");
7662 
7663   // Create new preheader for vector loop.
7664   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7665                                    DT, LI, nullptr, "vector.ph");
7666 
7667   if (ForEpilogue) {
7668     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7669                                  DT->getNode(Bypass)->getIDom()) &&
7670            "TC check is expected to dominate Bypass");
7671 
7672     // Update dominator for Bypass & LoopExit.
7673     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7674     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7675 
7676     LoopBypassBlocks.push_back(TCCheckBlock);
7677 
7678     // Save the trip count so we don't have to regenerate it in the
7679     // vec.epilog.iter.check. This is safe to do because the trip count
7680     // generated here dominates the vector epilog iter check.
7681     EPI.TripCount = Count;
7682   }
7683 
7684   ReplaceInstWithInst(
7685       TCCheckBlock->getTerminator(),
7686       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7687 
7688   return TCCheckBlock;
7689 }
7690 
7691 //===--------------------------------------------------------------------===//
7692 // EpilogueVectorizerEpilogueLoop
7693 //===--------------------------------------------------------------------===//
7694 
7695 /// This function is partially responsible for generating the control flow
7696 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7697 BasicBlock *
7698 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7699   MDNode *OrigLoopID = OrigLoop->getLoopID();
7700   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
7701 
  // Now, compare the remaining count; if there aren't enough iterations to
  // execute the vectorized epilogue, skip to the scalar part.
7704   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7705   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7706   LoopVectorPreHeader =
7707       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7708                  LI, nullptr, "vec.epilog.ph");
7709   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
7710                                           VecEpilogueIterationCountCheck);
7711 
7712   // Adjust the control flow taking the state info from the main loop
7713   // vectorization into account.
7714   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7715          "expected this to be saved from the previous pass.");
7716   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7717       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7718 
7719   DT->changeImmediateDominator(LoopVectorPreHeader,
7720                                EPI.MainLoopIterationCountCheck);
7721 
7722   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7723       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7724 
7725   if (EPI.SCEVSafetyCheck)
7726     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7727         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7728   if (EPI.MemSafetyCheck)
7729     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7730         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7731 
7732   DT->changeImmediateDominator(
7733       VecEpilogueIterationCountCheck,
7734       VecEpilogueIterationCountCheck->getSinglePredecessor());
7735 
7736   DT->changeImmediateDominator(LoopScalarPreHeader,
7737                                EPI.EpilogueIterationCountCheck);
7738   DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
7739 
7740   // Keep track of bypass blocks, as they feed start values to the induction
7741   // phis in the scalar loop preheader.
7742   if (EPI.SCEVSafetyCheck)
7743     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7744   if (EPI.MemSafetyCheck)
7745     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7746   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7747 
7748   // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
7750   Type *IdxTy = Legal->getWidestInductionType();
7751   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7752                                          LoopVectorPreHeader->getFirstNonPHI());
7753   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7754   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7755                            EPI.MainLoopIterationCountCheck);
7756 
7757   // Generate the induction variable.
7758   OldInduction = Legal->getPrimaryInduction();
7759   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7760   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7761   Value *StartIdx = EPResumeVal;
7762   Induction =
7763       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7764                               getDebugLocFromInstOrOperands(OldInduction));
7765 
7766   // Generate induction resume values. These variables save the new starting
7767   // indexes for the scalar loop. They are used to test if there are any tail
7768   // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, the resume value for the induction variable comes from
7771   // the trip count of the main vector loop, hence passing the AdditionalBypass
7772   // argument.
7773   createInductionResumeValues(Lp, CountRoundDown,
7774                               {VecEpilogueIterationCountCheck,
7775                                EPI.VectorTripCount} /* AdditionalBypass */);
7776 
7777   AddRuntimeUnrollDisableMetaData(Lp);
7778   return completeLoopSkeleton(Lp, OrigLoopID);
7779 }
7780 
7781 BasicBlock *
7782 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7783     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
7784 
  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
7787   assert(
7788       (!isa<Instruction>(EPI.TripCount) ||
7789        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7790       "saved trip count does not dominate insertion point.");
7791   Value *TC = EPI.TripCount;
7792   IRBuilder<> Builder(Insert->getTerminator());
7793   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7794 
7795   // Generate code to check if the loop's trip count is less than VF * UF of the
7796   // vector epilogue loop.
7797   auto P =
7798       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7799 
7800   Value *CheckMinIters = Builder.CreateICmp(
7801       P, Count,
7802       ConstantInt::get(Count->getType(),
7803                        EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
7804       "min.epilog.iters.check");
7805 
7806   ReplaceInstWithInst(
7807       Insert->getTerminator(),
7808       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7809 
7810   LoopBypassBlocks.push_back(Insert);
7811   return Insert;
7812 }
7813 
7814 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7815   LLVM_DEBUG({
7816     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7817            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7818            << ", Main Loop UF:" << EPI.MainLoopUF
7819            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7820            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7821   });
7822 }
7823 
7824 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7825   DEBUG_WITH_TYPE(VerboseDebug, {
7826     dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
7827   });
7828 }
7829 
7830 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7831     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7832   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7833   bool PredicateAtRangeStart = Predicate(Range.Start);
7834 
7835   for (ElementCount TmpVF = Range.Start * 2;
7836        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7837     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7838       Range.End = TmpVF;
7839       break;
7840     }
7841 
7842   return PredicateAtRangeStart;
7843 }
7844 
/// Build VPlans for the full range of feasible VFs = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VFs starting at a given VF and extending it as much as possible. Each
7848 /// vectorization decision can potentially shorten this sub-range during
7849 /// buildVPlan().
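/// For example (illustrative), with MinVF = 1 and MaxVF = 8 this may build one
/// VPlan covering {1} and another covering {2, 4, 8}, if the recorded
/// decisions split the range at VF = 2.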
7850 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7851                                            ElementCount MaxVF) {
7852   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7853   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7854     VFRange SubRange = {VF, MaxVFPlusOne};
7855     VPlans.push_back(buildVPlan(SubRange));
7856     VF = SubRange.End;
7857   }
7858 }
7859 
7860 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7861                                          VPlanPtr &Plan) {
7862   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7863 
7864   // Look for cached value.
7865   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7866   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7867   if (ECEntryIt != EdgeMaskCache.end())
7868     return ECEntryIt->second;
7869 
7870   VPValue *SrcMask = createBlockInMask(Src, Plan);
7871 
7872   // The terminator has to be a branch inst!
7873   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7874   assert(BI && "Unexpected terminator found");
7875 
7876   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7877     return EdgeMaskCache[Edge] = SrcMask;
7878 
7879   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
7880   assert(EdgeMask && "No Edge Mask found for condition");
7881 
7882   if (BI->getSuccessor(0) != Dst)
7883     EdgeMask = Builder.createNot(EdgeMask);
7884 
7885   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7886     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7887 
7888   return EdgeMaskCache[Edge] = EdgeMask;
7889 }
7890 
7891 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7892   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7893 
7894   // Look for cached value.
7895   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7896   if (BCEntryIt != BlockMaskCache.end())
7897     return BCEntryIt->second;
7898 
7899   // All-one mask is modelled as no-mask following the convention for masked
7900   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7901   VPValue *BlockMask = nullptr;
7902 
7903   if (OrigLoop->getHeader() == BB) {
7904     if (!CM.blockNeedsPredication(BB))
7905       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7906 
7907     // Create the block in mask as the first non-phi instruction in the block.
7908     VPBuilder::InsertPointGuard Guard(Builder);
7909     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
7910     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
7911 
7912     // Introduce the early-exit compare IV <= BTC to form header block mask.
7913     // This is used instead of IV < TC because TC may wrap, unlike BTC.
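    // For example, with an 8-bit IV a trip count of 256 wraps to 0, whereas
    // BTC == 255 still fits, so IV <= BTC correctly identifies the active
    // lanes.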
7914     // Start by constructing the desired canonical IV.
7915     VPValue *IV = nullptr;
7916     if (Legal->getPrimaryInduction())
7917       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
7918     else {
7919       auto IVRecipe = new VPWidenCanonicalIVRecipe();
7920       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
7921       IV = IVRecipe->getVPValue();
7922     }
7923     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7924     bool TailFolded = !CM.isScalarEpilogueAllowed();
7925 
7926     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
7927       // While ActiveLaneMask is a binary op that consumes the loop tripcount
7928       // as a second argument, we only pass the IV here and extract the
7929       // tripcount from the transform state where codegen of the VP instructions
      // happens.
7931       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
7932     } else {
7933       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
7934     }
7935     return BlockMaskCache[BB] = BlockMask;
7936   }
7937 
7938   // This is the block mask. We OR all incoming edges.
7939   for (auto *Predecessor : predecessors(BB)) {
7940     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7941     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7942       return BlockMaskCache[BB] = EdgeMask;
7943 
7944     if (!BlockMask) { // BlockMask has its initialized nullptr value.
7945       BlockMask = EdgeMask;
7946       continue;
7947     }
7948 
7949     BlockMask = Builder.createOr(BlockMask, EdgeMask);
7950   }
7951 
7952   return BlockMaskCache[BB] = BlockMask;
7953 }
7954 
7955 VPWidenMemoryInstructionRecipe *
7956 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7957                                   VPlanPtr &Plan) {
7958   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7959          "Must be called with either a load or store");
7960 
7961   auto willWiden = [&](ElementCount VF) -> bool {
7962     assert(!VF.isScalable() && "unexpected scalable ElementCount");
7963     if (VF.isScalar())
7964       return false;
7965     LoopVectorizationCostModel::InstWidening Decision =
7966         CM.getWideningDecision(I, VF);
7967     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7968            "CM decision should be taken at this point.");
7969     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7970       return true;
7971     if (CM.isScalarAfterVectorization(I, VF) ||
7972         CM.isProfitableToScalarize(I, VF))
7973       return false;
7974     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7975   };
7976 
7977   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7978     return nullptr;
7979 
7980   VPValue *Mask = nullptr;
7981   if (Legal->isMaskRequired(I))
7982     Mask = createBlockInMask(I->getParent(), Plan);
7983 
7984   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
7985   if (LoadInst *Load = dyn_cast<LoadInst>(I))
7986     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
7987 
7988   StoreInst *Store = cast<StoreInst>(I);
7989   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
7990   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
7991 }
7992 
7993 VPWidenIntOrFpInductionRecipe *
7994 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
7995   // Check if this is an integer or fp induction. If so, build the recipe that
7996   // produces its scalar and vector values.
7997   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
7998   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
7999       II.getKind() == InductionDescriptor::IK_FpInduction)
8000     return new VPWidenIntOrFpInductionRecipe(Phi);
8001 
8002   return nullptr;
8003 }
8004 
8005 VPWidenIntOrFpInductionRecipe *
8006 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
8007                                                 VFRange &Range) const {
8008   // Optimize the special case where the source is a constant integer
8009   // induction variable. Notice that we can only optimize the 'trunc' case
8010   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8011   // (c) other casts depend on pointer size.
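  // For example (illustrative IR), for
  //   %iv = phi i64 ...
  //   %t  = trunc i64 %iv to i32
  // we can widen a single i32 induction for %t instead of widening %iv and
  // truncating every element.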
8012 
8013   // Determine whether \p K is a truncation based on an induction variable that
8014   // can be optimized.
8015   auto isOptimizableIVTruncate =
8016       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8017     return [=](ElementCount VF) -> bool {
8018       return CM.isOptimizableIVTruncate(K, VF);
8019     };
8020   };
8021 
8022   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8023           isOptimizableIVTruncate(I), Range))
8024     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8025                                              I);
8026   return nullptr;
8027 }
8028 
8029 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
8030   // We know that all PHIs in non-header blocks are converted into selects, so
8031   // we don't have to worry about the insertion order and we can just use the
8032   // builder. At this point we generate the predication tree. There may be
8033   // duplications since this is a simple recursive scan, but future
8034   // optimizations will clean it up.
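  // For example (illustrative), a phi
  //   %p = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
  // becomes a VPBlendRecipe with operands
  //   {%a, edge-mask(bb1, parent), %b, edge-mask(bb2, parent)},
  // which is later lowered to a chain of selects.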
8035 
8036   SmallVector<VPValue *, 2> Operands;
8037   unsigned NumIncoming = Phi->getNumIncomingValues();
8038   for (unsigned In = 0; In < NumIncoming; In++) {
8039     VPValue *EdgeMask =
8040       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8041     assert((EdgeMask || NumIncoming == 1) &&
8042            "Multiple predecessors with one having a full mask");
8043     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
8044     if (EdgeMask)
8045       Operands.push_back(EdgeMask);
8046   }
8047   return new VPBlendRecipe(Phi, Operands);
8048 }
8049 
8050 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
8051                                                    VPlan &Plan) const {
8052 
8053   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8054       [this, CI](ElementCount VF) {
8055         return CM.isScalarWithPredication(CI, VF);
8056       },
8057       Range);
8058 
8059   if (IsPredicated)
8060     return nullptr;
8061 
8062   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8063   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8064              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8065              ID == Intrinsic::pseudoprobe))
8066     return nullptr;
8067 
8068   auto willWiden = [&](ElementCount VF) -> bool {
8069     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag shows whether we use an intrinsic or a usual call for the
    // vectorized version of the instruction.
    // Is it beneficial to perform an intrinsic call compared to a lib call?
8074     bool NeedToScalarize = false;
8075     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8076     bool UseVectorIntrinsic =
8077         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
8078     return UseVectorIntrinsic || !NeedToScalarize;
8079   };
8080 
8081   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8082     return nullptr;
8083 
8084   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
8085 }
8086 
8087 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8088   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8089          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // The instruction should be widened, unless it is scalar after
  // vectorization, scalarization is profitable, or it is predicated.
8092   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8093     return CM.isScalarAfterVectorization(I, VF) ||
8094            CM.isProfitableToScalarize(I, VF) ||
8095            CM.isScalarWithPredication(I, VF);
8096   };
8097   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8098                                                              Range);
8099 }
8100 
8101 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
8102   auto IsVectorizableOpcode = [](unsigned Opcode) {
8103     switch (Opcode) {
8104     case Instruction::Add:
8105     case Instruction::And:
8106     case Instruction::AShr:
8107     case Instruction::BitCast:
8108     case Instruction::FAdd:
8109     case Instruction::FCmp:
8110     case Instruction::FDiv:
8111     case Instruction::FMul:
8112     case Instruction::FNeg:
8113     case Instruction::FPExt:
8114     case Instruction::FPToSI:
8115     case Instruction::FPToUI:
8116     case Instruction::FPTrunc:
8117     case Instruction::FRem:
8118     case Instruction::FSub:
8119     case Instruction::ICmp:
8120     case Instruction::IntToPtr:
8121     case Instruction::LShr:
8122     case Instruction::Mul:
8123     case Instruction::Or:
8124     case Instruction::PtrToInt:
8125     case Instruction::SDiv:
8126     case Instruction::Select:
8127     case Instruction::SExt:
8128     case Instruction::Shl:
8129     case Instruction::SIToFP:
8130     case Instruction::SRem:
8131     case Instruction::Sub:
8132     case Instruction::Trunc:
8133     case Instruction::UDiv:
8134     case Instruction::UIToFP:
8135     case Instruction::URem:
8136     case Instruction::Xor:
8137     case Instruction::ZExt:
8138       return true;
8139     }
8140     return false;
8141   };
8142 
8143   if (!IsVectorizableOpcode(I->getOpcode()))
8144     return nullptr;
8145 
8146   // Success: widen this instruction.
8147   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
8148 }
8149 
8150 VPBasicBlock *VPRecipeBuilder::handleReplication(
8151     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8152     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
8153     VPlanPtr &Plan) {
8154   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8155       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8156       Range);
8157 
8158   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8159       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
8160       Range);
8161 
8162   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8163                                        IsUniform, IsPredicated);
8164   setRecipe(I, Recipe);
8165   Plan->addVPValue(I, Recipe);
8166 
8167   // Find if I uses a predicated instruction. If so, it will use its scalar
8168   // value. Avoid hoisting the insert-element which packs the scalar value into
8169   // a vector value, as that happens iff all users use the vector value.
8170   for (auto &Op : I->operands())
8171     if (auto *PredInst = dyn_cast<Instruction>(Op))
8172       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
8173         PredInst2Recipe[PredInst]->setAlsoPack(false);
8174 
8175   // Finalize the recipe for Instr, first if it is not predicated.
8176   if (!IsPredicated) {
8177     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8178     VPBB->appendRecipe(Recipe);
8179     return VPBB;
8180   }
8181   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8182   assert(VPBB->getSuccessors().empty() &&
8183          "VPBB has successors when handling predicated replication.");
8184   // Record predicated instructions for above packing optimizations.
8185   PredInst2Recipe[I] = Recipe;
8186   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8187   VPBlockUtils::insertBlockAfter(Region, VPBB);
8188   auto *RegSucc = new VPBasicBlock();
8189   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8190   return RegSucc;
8191 }
8192 
8193 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8194                                                       VPRecipeBase *PredRecipe,
8195                                                       VPlanPtr &Plan) {
8196   // Instructions marked for predication are replicated and placed under an
8197   // if-then construct to prevent side-effects.
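  // The region built below has the shape (illustrative):
  //   pred.<opcode>.entry:    branch-on-mask
  //   pred.<opcode>.if:       the replicated instruction
  //   pred.<opcode>.continue: optional phi merging the predicated value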
8198 
8199   // Generate recipes to compute the block mask for this region.
8200   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8201 
8202   // Build the triangular if-then region.
8203   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8204   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8205   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8206   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8207   auto *PHIRecipe = Instr->getType()->isVoidTy()
8208                         ? nullptr
8209                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8210   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8211   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8212   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8213 
8214   // Note: first set Entry as region entry and then connect successors starting
8215   // from it in order, to propagate the "parent" of each VPBasicBlock.
8216   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8217   VPBlockUtils::connectBlocks(Pred, Exit);
8218 
8219   return Region;
8220 }
8221 
8222 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8223                                                       VFRange &Range,
8224                                                       VPlanPtr &Plan) {
8225   // First, check for specific widening recipes that deal with calls, memory
8226   // operations, inductions and Phi nodes.
8227   if (auto *CI = dyn_cast<CallInst>(Instr))
8228     return tryToWidenCall(CI, Range, *Plan);
8229 
8230   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8231     return tryToWidenMemory(Instr, Range, Plan);
8232 
8233   VPRecipeBase *Recipe;
8234   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8235     if (Phi->getParent() != OrigLoop->getHeader())
8236       return tryToBlend(Phi, Plan);
8237     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
8238       return Recipe;
8239     return new VPWidenPHIRecipe(Phi);
8240   }
8241 
8242   if (isa<TruncInst>(Instr) &&
8243       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
8244     return Recipe;
8245 
8246   if (!shouldWiden(Instr, Range))
8247     return nullptr;
8248 
8249   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8250     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
8251                                 OrigLoop);
8252 
8253   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8254     bool InvariantCond =
8255         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8256     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
8257                                    InvariantCond);
8258   }
8259 
8260   return tryToWiden(Instr, *Plan);
8261 }
8262 
8263 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8264                                                         ElementCount MaxVF) {
8265   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8266 
8267   // Collect instructions from the original loop that will become trivially dead
8268   // in the vectorized loop. We don't need to vectorize these instructions. For
8269   // example, original induction update instructions can become dead because we
8270   // separately emit induction "steps" when generating code for the new loop.
8271   // Similarly, we create a new latch condition when setting up the structure
8272   // of the new loop, so the old one can become dead.
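  // For example (illustrative), an original update such as
  //   %iv.next = add nuw i64 %iv, 1
  // is superseded by separately emitted induction steps and typically becomes
  // dead.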
8273   SmallPtrSet<Instruction *, 4> DeadInstructions;
8274   collectTriviallyDeadInstructions(DeadInstructions);
8275 
8276   // Add assume instructions we need to drop to DeadInstructions, to prevent
8277   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
8279   // control flow is preserved, we should keep them.
8280   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8281   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8282 
8283   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8284   // Dead instructions do not need sinking. Remove them from SinkAfter.
8285   for (Instruction *I : DeadInstructions)
8286     SinkAfter.erase(I);
8287 
8288   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8289   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8290     VFRange SubRange = {VF, MaxVFPlusOne};
8291     VPlans.push_back(
8292         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8293     VF = SubRange.End;
8294   }
8295 }
8296 
8297 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8298     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8299     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8300 
8301   // Hold a mapping from predicated instructions to their recipes, in order to
8302   // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
8304   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
8305 
8306   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8307 
8308   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8309 
8310   // ---------------------------------------------------------------------------
8311   // Pre-construction: record ingredients whose recipes we'll need to further
8312   // process after constructing the initial VPlan.
8313   // ---------------------------------------------------------------------------
8314 
8315   // Mark instructions we'll need to sink later and their targets as
8316   // ingredients whose recipe we'll need to record.
8317   for (auto &Entry : SinkAfter) {
8318     RecipeBuilder.recordRecipeOf(Entry.first);
8319     RecipeBuilder.recordRecipeOf(Entry.second);
8320   }
8321   for (auto &Reduction : CM.getInLoopReductionChains()) {
8322     PHINode *Phi = Reduction.first;
8323     RecurrenceDescriptor::RecurrenceKind Kind =
8324         Legal->getReductionVars()[Phi].getRecurrenceKind();
8325     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8326 
8327     RecipeBuilder.recordRecipeOf(Phi);
8328     for (auto &R : ReductionOperations) {
8329       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
8331       // need to record the ICmp recipe, so it can be removed later.
8332       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8333           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8334         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8335       }
8336     }
8337   }
8338 
8339   // For each interleave group which is relevant for this (possibly trimmed)
8340   // Range, add it to the set of groups to be later applied to the VPlan and add
8341   // placeholders for its members' Recipes which we'll be replacing with a
8342   // single VPInterleaveRecipe.
8343   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8344     auto applyIG = [IG, this](ElementCount VF) -> bool {
8345       return (VF.isVector() && // Query is illegal for VF == 1
8346               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8347                   LoopVectorizationCostModel::CM_Interleave);
8348     };
8349     if (!getDecisionAndClampRange(applyIG, Range))
8350       continue;
8351     InterleaveGroups.insert(IG);
8352     for (unsigned i = 0; i < IG->getFactor(); i++)
8353       if (Instruction *Member = IG->getMember(i))
8354         RecipeBuilder.recordRecipeOf(Member);
8355   };
8356 
8357   // ---------------------------------------------------------------------------
8358   // Build initial VPlan: Scan the body of the loop in a topological order to
8359   // visit each basic block after having visited its predecessor basic blocks.
8360   // ---------------------------------------------------------------------------
8361 
8362   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8363   auto Plan = std::make_unique<VPlan>();
8364   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8365   Plan->setEntry(VPBB);
8366 
8367   // Scan the body of the loop in a topological order to visit each basic block
8368   // after having visited its predecessor basic blocks.
8369   LoopBlocksDFS DFS(OrigLoop);
8370   DFS.perform(LI);
8371 
8372   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients that fill a new VPBasicBlock.
8375     unsigned VPBBsForBB = 0;
8376     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8377     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8378     VPBB = FirstVPBBForBB;
8379     Builder.setInsertPoint(VPBB);
8380 
8381     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
8383     for (Instruction &I : BB->instructionsWithoutDebug()) {
8384       Instruction *Instr = &I;
8385 
8386       // First filter out irrelevant instructions, to ensure no recipes are
8387       // built for them.
8388       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8389         continue;
8390 
8391       if (auto Recipe =
8392               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8393         // Check if the recipe can be converted to a VPValue. We need the extra
8394         // down-casting step until VPRecipeBase inherits from VPValue.
8395         VPValue *MaybeVPValue = Recipe->toVPValue();
8396         if (!Instr->getType()->isVoidTy() && MaybeVPValue)
8397           Plan->addVPValue(Instr, MaybeVPValue);
8398 
8399         RecipeBuilder.setRecipe(Instr, Recipe);
8400         VPBB->appendRecipe(Recipe);
8401         continue;
8402       }
8403 
      // Otherwise, if all widening options failed, the instruction is to be
8405       // replicated. This may create a successor for VPBB.
8406       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
8407           Instr, Range, VPBB, PredInst2Recipe, Plan);
8408       if (NextVPBB != VPBB) {
8409         VPBB = NextVPBB;
8410         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8411                                     : "");
8412       }
8413     }
8414   }
8415 
8416   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one (VPBB), reflecting original
  // basic blocks with no recipes.
8419   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8420   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8421   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8422   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8423   delete PreEntry;
8424 
8425   // ---------------------------------------------------------------------------
8426   // Transform initial VPlan: Apply previously taken decisions, in order, to
8427   // bring the VPlan to its final state.
8428   // ---------------------------------------------------------------------------
8429 
8430   // Apply Sink-After legal constraints.
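  // (Each entry maps an instruction whose recipe must be sunk to the one it
  // has to follow; Legality records such pairs, e.g. for first-order
  // recurrences.)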
8431   for (auto &Entry : SinkAfter) {
8432     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8433     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8434     Sink->moveAfter(Target);
8435   }
8436 
8437   // Interleave memory: for each Interleave Group we marked earlier as relevant
8438   // for this VPlan, replace the Recipes widening its memory instructions with a
8439   // single VPInterleaveRecipe at its insertion point.
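  // For example, a factor-2 group covering loads of A[2*i] and A[2*i+1] has
  // both of its widened-load recipes replaced by a single interleave recipe
  // placed at the group's insert position.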
8440   for (auto IG : InterleaveGroups) {
8441     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8442         RecipeBuilder.getRecipe(IG->getInsertPos()));
8443     SmallVector<VPValue *, 4> StoredValues;
8444     for (unsigned i = 0; i < IG->getFactor(); ++i)
8445       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
8446         StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
8447 
8448     (new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8449                             Recipe->getMask()))
8450         ->insertBefore(Recipe);
8451 
8452     for (unsigned i = 0; i < IG->getFactor(); ++i)
8453       if (Instruction *Member = IG->getMember(i)) {
8454         if (!Member->getType()->isVoidTy()) {
8455           VPValue *OriginalV = Plan->getVPValue(Member);
8456           Plan->removeVPValueFor(Member);
8457           OriginalV->replaceAllUsesWith(Plan->getOrAddVPValue(Member));
8458         }
8459         RecipeBuilder.getRecipe(Member)->eraseFromParent();
8460       }
8461   }
8462 
  // Adjust the recipes for any in-loop reductions.
8464   if (Range.Start.isVector())
8465     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
8466 
8467   // Finally, if tail is folded by masking, introduce selects between the phi
8468   // and the live-out instruction of each reduction, at the end of the latch.
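  // That is, the live-out becomes select(header-mask, reduction-value, phi),
  // so inactive (masked-off) lanes keep the phi's incoming value.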
8469   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
8470     Builder.setInsertPoint(VPBB);
8471     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
8472     for (auto &Reduction : Legal->getReductionVars()) {
8473       if (CM.isInLoopReduction(Reduction.first))
8474         continue;
8475       VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
8476       VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
8477       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
8478     }
8479   }
8480 
8481   std::string PlanName;
8482   raw_string_ostream RSO(PlanName);
8483   ElementCount VF = Range.Start;
8484   Plan->addVF(VF);
8485   RSO << "Initial VPlan for VF={" << VF;
8486   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
8487     Plan->addVF(VF);
8488     RSO << "," << VF;
8489   }
8490   RSO << "},UF>=1";
8491   RSO.flush();
8492   Plan->setName(PlanName);
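  // E.g. the plan is named "Initial VPlan for VF={4,8},UF>=1" when Range
  // covers the VFs 4 and 8.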
8493 
8494   return Plan;
8495 }
8496 
8497 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
8502   assert(!OrigLoop->isInnermost());
8503   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8504 
8505   // Create new empty VPlan
8506   auto Plan = std::make_unique<VPlan>();
8507 
8508   // Build hierarchical CFG
8509   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8510   HCFGBuilder.buildHierarchicalCFG();
8511 
8512   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8513        VF *= 2)
8514     Plan->addVF(VF);
8515 
8516   if (EnableVPlanPredication) {
8517     VPlanPredicator VPP(*Plan);
8518     VPP.predicate();
8519 
8520     // Avoid running transformation to recipes until masked code generation in
8521     // VPlan-native path is in place.
8522     return Plan;
8523   }
8524 
8525   SmallPtrSet<Instruction *, 1> DeadInstructions;
8526   VPlanTransforms::VPInstructionsToVPRecipes(
8527       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
8528   return Plan;
8529 }
8530 
// Adjust the recipes for any in-loop reductions. The chain of instructions
// leading from the loop-exit instruction to the phi needs to be converted into
// reduction recipes, with one operand being the vector of values to reduce and
// the other being the scalar reduction chain.
8535 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8536     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8537   for (auto &Reduction : CM.getInLoopReductionChains()) {
8538     PHINode *Phi = Reduction.first;
8539     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8540     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8541 
    // ReductionOperations are ordered top-down from the phi's use to the
    // loop-exit value. We keep track of the previous item (the Chain) to tell
    // which of the two operands remains part of the scalar reduction chain and
    // which one is reduced. For min/max reductions the chain is formed by the
    // select instructions.
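    // For example, for an in-loop reduction "sum += a[i] * b[i]" the chain is
    // phi -> add; the add's non-chain operand (the multiply) becomes the
    // vector operand of the reduction.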
8546     Instruction *Chain = Phi;
8547     for (Instruction *R : ReductionOperations) {
8548       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8549       RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
8550 
8551       VPValue *ChainOp = Plan->getVPValue(Chain);
8552       unsigned FirstOpId;
8553       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8554           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8555         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8556                "Expected to replace a VPWidenSelectSC");
8557         FirstOpId = 1;
8558       } else {
8559         assert(isa<VPWidenRecipe>(WidenRecipe) &&
8560                "Expected to replace a VPWidenSC");
8561         FirstOpId = 0;
8562       }
8563       unsigned VecOpId =
8564           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
8565       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
8566 
8567       auto *CondOp = CM.foldTailByMasking()
8568                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
8569                          : nullptr;
8570       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
8571           &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
8572       WidenRecipe->toVPValue()->replaceAllUsesWith(RedRecipe);
8573       Plan->removeVPValueFor(R);
8574       Plan->addVPValue(R, RedRecipe);
8575       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
8576       WidenRecipe->eraseFromParent();
8577 
8578       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8579           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8580         VPRecipeBase *CompareRecipe =
8581             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
8582         assert(isa<VPWidenRecipe>(CompareRecipe) &&
8583                "Expected to replace a VPWidenSC");
8584         assert(CompareRecipe->toVPValue()->getNumUsers() == 0 &&
8585                "Expected no remaining users");
8586         CompareRecipe->eraseFromParent();
8587       }
8588       Chain = R;
8589     }
8590   }
8591 }
8592 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
8597 
8598 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
8599     Value *V, const VPIteration &Instance) {
8600   return ILV.getOrCreateScalarValue(V, Instance);
8601 }
8602 
8603 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
8604                                VPSlotTracker &SlotTracker) const {
8605   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
8606   IG->getInsertPos()->printAsOperand(O, false);
8607   O << ", ";
8608   getAddr()->printAsOperand(O, SlotTracker);
8609   VPValue *Mask = getMask();
8610   if (Mask) {
8611     O << ", ";
8612     Mask->printAsOperand(O, SlotTracker);
8613   }
8614   for (unsigned i = 0; i < IG->getFactor(); ++i)
8615     if (Instruction *I = IG->getMember(i))
8616       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
8617 }
8618 
8619 void VPWidenCallRecipe::execute(VPTransformState &State) {
8620   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
8621                                   *this, State);
8622 }
8623 
8624 void VPWidenSelectRecipe::execute(VPTransformState &State) {
8625   State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
8626                                     this, *this, InvariantCond, State);
8627 }
8628 
8629 void VPWidenRecipe::execute(VPTransformState &State) {
8630   State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
8631 }
8632 
8633 void VPWidenGEPRecipe::execute(VPTransformState &State) {
8634   State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
8635                       *this, State.UF, State.VF, IsPtrLoopInvariant,
8636                       IsIndexLoopInvariant, State);
8637 }
8638 
8639 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
8640   assert(!State.Instance && "Int or FP induction being replicated.");
8641   State.ILV->widenIntOrFpInduction(IV, Trunc);
8642 }
8643 
8644 void VPWidenPHIRecipe::execute(VPTransformState &State) {
8645   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
8646 }
8647 
8648 void VPBlendRecipe::execute(VPTransformState &State) {
8649   State.ILV->setDebugLocFromInst(State.Builder, Phi);
8650   // We know that all PHIs in non-header blocks are converted into
8651   // selects, so we don't have to worry about the insertion order and we
8652   // can just use the builder.
8653   // At this point we generate the predication tree. There may be
8654   // duplications since this is a simple recursive scan, but future
8655   // optimizations will clean it up.
8656 
8657   unsigned NumIncoming = getNumIncomingValues();
8658 
8659   // Generate a sequence of selects of the form:
8660   // SELECT(Mask3, In3,
8661   //        SELECT(Mask2, In2,
8662   //               SELECT(Mask1, In1,
8663   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // (and which are therefore essentially undef) take their value from In0.
8666   InnerLoopVectorizer::VectorParts Entry(State.UF);
8667   for (unsigned In = 0; In < NumIncoming; ++In) {
8668     for (unsigned Part = 0; Part < State.UF; ++Part) {
8669       // We might have single edge PHIs (blocks) - use an identity
8670       // 'select' for the first PHI operand.
8671       Value *In0 = State.get(getIncomingValue(In), Part);
8672       if (In == 0)
8673         Entry[Part] = In0; // Initialize with the first incoming value.
8674       else {
8675         // Select between the current value and the previous incoming edge
8676         // based on the incoming mask.
8677         Value *Cond = State.get(getMask(In), Part);
8678         Entry[Part] =
8679             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8680       }
8681     }
8682   }
8683   for (unsigned Part = 0; Part < State.UF; ++Part)
8684     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
8685 }
8686 
8687 void VPInterleaveRecipe::execute(VPTransformState &State) {
8688   assert(!State.Instance && "Interleave group being replicated.");
8689   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getStoredValues(),
8690                                       getMask());
8691 }
8692 
8693 void VPReductionRecipe::execute(VPTransformState &State) {
8694   assert(!State.Instance && "Reduction being replicated.");
8695   for (unsigned Part = 0; Part < State.UF; ++Part) {
8696     RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc->getRecurrenceKind();
8697     Value *NewVecOp = State.get(getVecOp(), Part);
8698     if (VPValue *Cond = getCondOp()) {
8699       Value *NewCond = State.get(Cond, Part);
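      // Blend masked-off lanes with the reduction's identity value (e.g. 0 for
      // an integer add) so that they do not affect the reduced result.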
8700       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
8701       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
8702           Kind, RdxDesc->getMinMaxRecurrenceKind(), VecTy->getElementType());
8703       Constant *IdenVec =
8704           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
8705       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
8706       NewVecOp = Select;
8707     }
8708     Value *NewRed =
8709         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
8710     Value *PrevInChain = State.get(getChainOp(), Part);
8711     Value *NextInChain;
8712     if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8713         Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8714       NextInChain =
8715           createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
8716                          NewRed, PrevInChain);
8717     } else {
8718       NextInChain = State.Builder.CreateBinOp(
8719           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
8720           PrevInChain);
8721     }
8722     State.set(this, getUnderlyingInstr(), NextInChain, Part);
8723   }
8724 }
8725 
8726 void VPReplicateRecipe::execute(VPTransformState &State) {
8727   if (State.Instance) { // Generate a single instance.
8728     State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
8729                                     *State.Instance, IsPredicated, State);
8730     // Insert scalar instance packing it into a vector.
8731     if (AlsoPack && State.VF.isVector()) {
8732       // If we're constructing lane 0, initialize to start from undef.
8733       if (State.Instance->Lane == 0) {
8734         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8735         Value *Undef = UndefValue::get(
8736             VectorType::get(getUnderlyingValue()->getType(), State.VF));
8737         State.ValueMap.setVectorValue(getUnderlyingInstr(),
8738                                       State.Instance->Part, Undef);
8739       }
8740       State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
8741                                            *State.Instance);
8742     }
8743     return;
8744   }
8745 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane of each
  // of the UF parts.
8749   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8750   for (unsigned Part = 0; Part < State.UF; ++Part)
8751     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8752       State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
8753                                       IsPredicated, State);
8754 }
8755 
8756 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8757   assert(State.Instance && "Branch on Mask works only on single instance.");
8758 
8759   unsigned Part = State.Instance->Part;
8760   unsigned Lane = State.Instance->Lane;
8761 
8762   Value *ConditionBit = nullptr;
8763   VPValue *BlockInMask = getMask();
8764   if (BlockInMask) {
8765     ConditionBit = State.get(BlockInMask, Part);
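    // If the mask is a vector, extract the bit corresponding to the lane of
    // this instance.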
8766     if (ConditionBit->getType()->isVectorTy())
8767       ConditionBit = State.Builder.CreateExtractElement(
8768           ConditionBit, State.Builder.getInt32(Lane));
8769   } else // Block in mask is all-one.
8770     ConditionBit = State.Builder.getTrue();
8771 
8772   // Replace the temporary unreachable terminator with a new conditional branch,
8773   // whose two destinations will be set later when they are created.
8774   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8775   assert(isa<UnreachableInst>(CurrentTerminator) &&
8776          "Expected to replace unreachable terminator with conditional branch.");
8777   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8778   CondBr->setSuccessor(0, nullptr);
8779   ReplaceInstWithInst(CurrentTerminator, CondBr);
8780 }
8781 
8782 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8783   assert(State.Instance && "Predicated instruction PHI works per instance.");
8784   Instruction *ScalarPredInst =
8785       cast<Instruction>(State.get(getOperand(0), *State.Instance));
8786   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8787   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8788   assert(PredicatingBB && "Predicated block has no single predecessor.");
8789 
8790   // By current pack/unpack logic we need to generate only a single phi node: if
8791   // a vector value for the predicated instruction exists at this point it means
8792   // the instruction has vector users only, and a phi for the vector value is
8793   // needed. In this case the recipe of the predicated instruction is marked to
8794   // also do that packing, thereby "hoisting" the insert-element sequence.
8795   // Otherwise, a phi node for the scalar value is needed.
8796   unsigned Part = State.Instance->Part;
8797   Instruction *PredInst =
8798       cast<Instruction>(getOperand(0)->getUnderlyingValue());
8799   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8800     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8801     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8802     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8803     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8804     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8805     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8806   } else {
8807     Type *PredInstType = PredInst->getType();
8808     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8809     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8810     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8811     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8812   }
8813 }
8814 
8815 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8816   Instruction *Instr = getUnderlyingInstr();
8817   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
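  // Stores produce no value, so pass this recipe as the defined VPValue only
  // for loads.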
8818   State.ILV->vectorizeMemoryInstruction(Instr, State,
8819                                         StoredValue ? nullptr : this, getAddr(),
8820                                         StoredValue, getMask());
8821 }
8822 
8823 // Determine how to lower the scalar epilogue, which depends on 1) optimising
8824 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8825 // predication, and 4) a TTI hook that analyses whether the loop is suitable
8826 // for predication.
8827 static ScalarEpilogueLowering getScalarEpilogueLowering(
8828     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8829     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8830     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8831     LoopVectorizationLegality &LVL) {
8832   // 1) OptSize takes precedence over all other options, i.e. if this is set,
8833   // don't look at hints or options, and don't request a scalar epilogue.
8834   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8835   // LoopAccessInfo (due to code dependency and not being able to reliably get
8836   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8837   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
  // versioning when the vectorization is forced, unlike hasOptSize. So revert
  // to the old way and vectorize with versioning when forced. See D81345.)
8840   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8841                                                       PGSOQueryType::IRPass) &&
8842                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8843     return CM_ScalarEpilogueNotAllowedOptSize;
8844 
8845   bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() &&
8846                               !PreferPredicateOverEpilogue;
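  // (getNumOccurrences() distinguishes an explicit "false" on the command line
  // from the option's default value.)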
8847 
8848   // 2) Next, if disabling predication is requested on the command line, honour
8849   // this and request a scalar epilogue.
8850   if (PredicateOptDisabled)
8851     return CM_ScalarEpilogueAllowed;
8852 
  // 3) and 4) Finally, if enabling predication is requested on the command
  // line or with a loop hint, or if the TTI hook indicates that predication is
  // profitable, request predication.
8856   if (PreferPredicateOverEpilogue ||
8857       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
8858       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8859                                         LVL.getLAI()) &&
8860        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
8861     return CM_ScalarEpilogueNotNeededUsePredicate;
8862 
8863   return CM_ScalarEpilogueAllowed;
8864 }
8865 
8866 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
8867                            unsigned Part) {
8868   set(Def, V, Part);
8869   ILV->setVectorValue(IRDef, Part, V);
8870 }
8871 
8872 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
8874 // VPlan-to-VPlan transformations from the very beginning without modifying the
8875 // input LLVM IR.
8876 static bool processLoopInVPlanNativePath(
8877     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8878     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8879     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8880     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8881     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8882 
8883   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
8884     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8885     return false;
8886   }
8887   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8888   Function *F = L->getHeader()->getParent();
8889   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8890 
8891   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8892       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8893 
8894   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8895                                 &Hints, IAI);
8896   // Use the planner for outer loop vectorization.
8897   // TODO: CM is not used at this point inside the planner. Turn CM into an
8898   // optional argument if we don't need it in the future.
8899   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8900 
8901   // Get user vectorization factor.
8902   ElementCount UserVF = Hints.getWidth();
8903   if (UserVF.isScalable()) {
8904     // TODO: Use scalable UserVF once we've added initial support for scalable
8905     // vectorization. For now we convert it to fixed width, but this will be
8906     // removed in a later patch.
8907     UserVF = ElementCount::getFixed(UserVF.getKnownMinValue());
8908   }
8909 
8910   // Plan how to best vectorize, return the best VF and its cost.
8911   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
8912 
8913   // If we are stress testing VPlan builds, do not attempt to generate vector
8914   // code. Masked vector code generation support will follow soon.
8915   // Also, do not attempt to vectorize if no vector code will be produced.
8916   if (VPlanBuildStressTest || EnableVPlanPredication ||
8917       VectorizationFactor::Disabled() == VF)
8918     return false;
8919 
8920   LVP.setBestPlan(VF.Width, 1);
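  // Note: the VPlan-native path currently uses an interleave (unroll) factor
  // of 1.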
8921 
8922   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
8923                          &CM, BFI, PSI);
8924   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8925                     << L->getHeader()->getParent()->getName() << "\"\n");
8926   LVP.executePlan(LB, DT);
8927 
8928   // Mark the loop as already vectorized to avoid vectorizing again.
8929   Hints.setAlreadyVectorized();
8930 
8931   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8932   return true;
8933 }
8934 
8935 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8936     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8937                                !EnableLoopInterleaving),
8938       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8939                               !EnableLoopVectorization) {}
8940 
8941 bool LoopVectorizePass::processLoop(Loop *L) {
8942   assert((EnableVPlanNativePath || L->isInnermost()) &&
8943          "VPlan-native path is not enabled. Only process inner loops.");
8944 
8945 #ifndef NDEBUG
8946   const std::string DebugLocStr = getDebugLocString(L);
8947 #endif /* NDEBUG */
8948 
8949   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
8950                     << L->getHeader()->getParent()->getName() << "\" from "
8951                     << DebugLocStr << "\n");
8952 
8953   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
8954 
8955   LLVM_DEBUG(
8956       dbgs() << "LV: Loop hints:"
8957              << " force="
8958              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8959                      ? "disabled"
8960                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8961                             ? "enabled"
8962                             : "?"))
8963              << " width=" << Hints.getWidth()
8964              << " unroll=" << Hints.getInterleave() << "\n");
8965 
8966   // Function containing loop
8967   Function *F = L->getHeader()->getParent();
8968 
8969   // Looking at the diagnostic output is the only way to determine if a loop
8970   // was vectorized (other than looking at the IR or machine code), so it
8971   // is important to generate an optimization remark for each loop. Most of
8972   // these messages are generated as OptimizationRemarkAnalysis. Remarks
8973   // generated as OptimizationRemark and OptimizationRemarkMissed are
8974   // less verbose reporting vectorized loops and unvectorized loops that may
8975   // benefit from vectorization, respectively.
8976 
8977   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
8978     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
8979     return false;
8980   }
8981 
8982   PredicatedScalarEvolution PSE(*SE, *L);
8983 
8984   // Check if it is legal to vectorize the loop.
8985   LoopVectorizationRequirements Requirements(*ORE);
8986   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
8987                                 &Requirements, &Hints, DB, AC, BFI, PSI);
8988   if (!LVL.canVectorize(EnableVPlanNativePath)) {
8989     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
8990     Hints.emitRemarkWithHints();
8991     return false;
8992   }
8993 
8994   // Check the function attributes and profiles to find out if this function
8995   // should be optimized for size.
8996   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8997       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
8998 
8999   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9000   // here. They may require CFG and instruction level transformations before
9001   // even evaluating whether vectorization is profitable. Since we cannot modify
9002   // the incoming IR, we need to build VPlan upfront in the vectorization
9003   // pipeline.
9004   if (!L->isInnermost())
9005     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9006                                         ORE, BFI, PSI, Hints);
9007 
9008   assert(L->isInnermost() && "Inner loop expected.");
9009 
9010   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9011   // count by optimizing for size, to minimize overheads.
9012   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9013   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9014     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9015                       << "This loop is worth vectorizing only if no scalar "
9016                       << "iteration overheads are incurred.");
9017     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9018       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9019     else {
9020       LLVM_DEBUG(dbgs() << "\n");
9021       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9022     }
9023   }
9024 
9025   // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check does not seem correct -- what if the loop is an integer
  // loop and the vector instructions selected are purely integer vector
  // instructions?
9029   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9030     reportVectorizationFailure(
9031         "Can't vectorize when the NoImplicitFloat attribute is used",
9032         "loop not vectorized due to NoImplicitFloat attribute",
9033         "NoImplicitFloat", ORE, L);
9034     Hints.emitRemarkWithHints();
9035     return false;
9036   }
9037 
9038   // Check if the target supports potentially unsafe FP vectorization.
9039   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9040   // for the target we're vectorizing for, to make sure none of the
9041   // additional fp-math flags can help.
9042   if (Hints.isPotentiallyUnsafe() &&
9043       TTI->isFPVectorizationPotentiallyUnsafe()) {
9044     reportVectorizationFailure(
9045         "Potentially unsafe FP op prevents vectorization",
9046         "loop not vectorized due to unsafe FP support.",
9047         "UnsafeFP", ORE, L);
9048     Hints.emitRemarkWithHints();
9049     return false;
9050   }
9051 
9052   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9053   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9054 
9055   // If an override option has been passed in for interleaved accesses, use it.
9056   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9057     UseInterleaved = EnableInterleavedMemAccesses;
9058 
9059   // Analyze interleaved memory accesses.
9060   if (UseInterleaved) {
9061     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9062   }
9063 
9064   // Use the cost model.
9065   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9066                                 F, &Hints, IAI);
9067   CM.collectValuesToIgnore();
9068 
9069   // Use the planner for vectorization.
9070   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
9071 
9072   // Get user vectorization factor and interleave count.
9073   ElementCount UserVF = Hints.getWidth();
9074   if (UserVF.isScalable()) {
9075     // TODO: Use scalable UserVF once we've added initial support for scalable
9076     // vectorization. For now we convert it to fixed width, but this will be
9077     // removed in a later patch.
9078     UserVF = ElementCount::getFixed(UserVF.getKnownMinValue());
9079   }
9080 
9081   unsigned UserIC = Hints.getInterleave();
9082 
9083   // Plan how to best vectorize, return the best VF and its cost.
9084   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9085 
9086   VectorizationFactor VF = VectorizationFactor::Disabled();
9087   unsigned IC = 1;
9088 
9089   if (MaybeVF) {
9090     VF = *MaybeVF;
9091     // Select the interleave count.
9092     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9093   }
9094 
9095   // Identify the diagnostic messages that should be produced.
9096   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9097   bool VectorizeLoop = true, InterleaveLoop = true;
9098   if (Requirements.doesNotMeet(F, L, Hints)) {
9099     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
9100                          "requirements.\n");
9101     Hints.emitRemarkWithHints();
9102     return false;
9103   }
9104 
9105   if (VF.Width.isScalar()) {
9106     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9107     VecDiagMsg = std::make_pair(
9108         "VectorizationNotBeneficial",
9109         "the cost-model indicates that vectorization is not beneficial");
9110     VectorizeLoop = false;
9111   }
9112 
9113   if (!MaybeVF && UserIC > 1) {
9114     // Tell the user interleaving was avoided up-front, despite being explicitly
9115     // requested.
9116     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9117                          "interleaving should be avoided up front\n");
9118     IntDiagMsg = std::make_pair(
9119         "InterleavingAvoided",
9120         "Ignoring UserIC, because interleaving was avoided up front");
9121     InterleaveLoop = false;
9122   } else if (IC == 1 && UserIC <= 1) {
9123     // Tell the user interleaving is not beneficial.
9124     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9125     IntDiagMsg = std::make_pair(
9126         "InterleavingNotBeneficial",
9127         "the cost-model indicates that interleaving is not beneficial");
9128     InterleaveLoop = false;
9129     if (UserIC == 1) {
9130       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9131       IntDiagMsg.second +=
9132           " and is explicitly disabled or interleave count is set to 1";
9133     }
9134   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
9136     LLVM_DEBUG(
9137         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9138     IntDiagMsg = std::make_pair(
9139         "InterleavingBeneficialButDisabled",
9140         "the cost-model indicates that interleaving is beneficial "
9141         "but is explicitly disabled or interleave count is set to 1");
9142     InterleaveLoop = false;
9143   }
9144 
9145   // Override IC if user provided an interleave count.
9146   IC = UserIC > 0 ? UserIC : IC;
9147 
9148   // Emit diagnostic messages, if any.
9149   const char *VAPassName = Hints.vectorizeAnalysisPassName();
9150   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
9152     ORE->emit([&]() {
9153       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9154                                       L->getStartLoc(), L->getHeader())
9155              << VecDiagMsg.second;
9156     });
9157     ORE->emit([&]() {
9158       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9159                                       L->getStartLoc(), L->getHeader())
9160              << IntDiagMsg.second;
9161     });
9162     return false;
9163   } else if (!VectorizeLoop && InterleaveLoop) {
9164     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9165     ORE->emit([&]() {
9166       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9167                                         L->getStartLoc(), L->getHeader())
9168              << VecDiagMsg.second;
9169     });
9170   } else if (VectorizeLoop && !InterleaveLoop) {
9171     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9172                       << ") in " << DebugLocStr << '\n');
9173     ORE->emit([&]() {
9174       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9175                                         L->getStartLoc(), L->getHeader())
9176              << IntDiagMsg.second;
9177     });
9178   } else if (VectorizeLoop && InterleaveLoop) {
9179     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9180                       << ") in " << DebugLocStr << '\n');
9181     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9182   }
9183 
9184   LVP.setBestPlan(VF.Width, IC);
9185 
9186   using namespace ore;
9187   bool DisableRuntimeUnroll = false;
9188   MDNode *OrigLoopID = L->getLoopID();
9189 
9190   if (!VectorizeLoop) {
9191     assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that vectorizing the loop is not profitable, then
    // interleave it.
9194     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
9195                                BFI, PSI);
9196     LVP.executePlan(Unroller, DT);
9197 
9198     ORE->emit([&]() {
9199       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9200                                 L->getHeader())
9201              << "interleaved loop (interleaved count: "
9202              << NV("InterleaveCount", IC) << ")";
9203     });
9204   } else {
    // If we decided that it is both legal *and* profitable to vectorize the
    // loop, then do it.
9206 
9207     // Consider vectorizing the epilogue too if it's profitable.
9208     VectorizationFactor EpilogueVF =
9209       CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
9210     if (EpilogueVF.Width.isVector()) {
9211 
9212       // The first pass vectorizes the main loop and creates a scalar epilogue
9213       // to be vectorized by executing the plan (potentially with a different
9214       // factor) again shortly afterwards.
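      // EPI carries the VF/UF for both the main loop and the epilogue loop;
      // the epilogue itself is not interleaved (UF 1).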
9215       EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
9216                                         EpilogueVF.Width.getKnownMinValue(), 1);
9217       EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI,
9218                                          &LVL, &CM, BFI, PSI);
9219 
9220       LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
9221       LVP.executePlan(MainILV, DT);
9222       ++LoopsVectorized;
9223 
9224       simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9225       formLCSSARecursively(*L, *DT, LI, SE);
9226 
9227       // Second pass vectorizes the epilogue and adjusts the control flow
9228       // edges from the first pass.
9229       LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
9230       EPI.MainLoopVF = EPI.EpilogueVF;
9231       EPI.MainLoopUF = EPI.EpilogueUF;
9232       EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
9233                                                ORE, EPI, &LVL, &CM, BFI, PSI);
9234       LVP.executePlan(EpilogILV, DT);
9235       ++LoopsEpilogueVectorized;
9236 
9237       if (!MainILV.areSafetyChecksAdded())
9238         DisableRuntimeUnroll = true;
9239     } else {
9240       InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
9241                              &LVL, &CM, BFI, PSI);
9242       LVP.executePlan(LB, DT);
9243       ++LoopsVectorized;
9244 
9245       // Add metadata to disable runtime unrolling a scalar loop when there are
9246       // no runtime checks about strides and memory. A scalar loop that is
9247       // rarely used is not worth unrolling.
9248       if (!LB.areSafetyChecksAdded())
9249         DisableRuntimeUnroll = true;
9250     }
9251 
9252     // Report the vectorization decision.
9253     ORE->emit([&]() {
9254       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
9255                                 L->getHeader())
9256              << "vectorized loop (vectorization width: "
9257              << NV("VectorizationFactor", VF.Width)
9258              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
9259     });
9260   }
9261 
9262   Optional<MDNode *> RemainderLoopID =
9263       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
9264                                       LLVMLoopVectorizeFollowupEpilogue});
9265   if (RemainderLoopID.hasValue()) {
9266     L->setLoopID(RemainderLoopID.getValue());
9267   } else {
9268     if (DisableRuntimeUnroll)
9269       AddRuntimeUnrollDisableMetaData(L);
9270 
9271     // Mark the loop as already vectorized to avoid vectorizing again.
9272     Hints.setAlreadyVectorized();
9273   }
9274 
9275   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9276   return true;
9277 }
9278 
9279 LoopVectorizeResult LoopVectorizePass::runImpl(
9280     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
9281     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
9282     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
9283     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
9284     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
9285   SE = &SE_;
9286   LI = &LI_;
9287   TTI = &TTI_;
9288   DT = &DT_;
9289   BFI = &BFI_;
9290   TLI = TLI_;
9291   AA = &AA_;
9292   AC = &AC_;
9293   GetLAA = &GetLAA_;
9294   DB = &DB_;
9295   ORE = &ORE_;
9296   PSI = PSI_;
9297 
9298   // Don't attempt if
9299   // 1. the target claims to have no vector registers, and
9300   // 2. interleaving won't help ILP.
9301   //
9302   // The second condition is necessary because, even if the target has no
9303   // vector registers, loop vectorization may still enable scalar
9304   // interleaving.
9305   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
9306       TTI->getMaxInterleaveFactor(1) < 2)
9307     return LoopVectorizeResult(false, false);
9308 
9309   bool Changed = false, CFGChanged = false;
9310 
9311   // The vectorizer requires loops to be in simplified form.
9312   // Since simplification may add new inner loops, it has to run before the
9313   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
9315   // vectorized.
9316   for (auto &L : *LI)
9317     Changed |= CFGChanged |=
9318         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9319 
9320   // Build up a worklist of inner-loops to vectorize. This is necessary as
9321   // the act of vectorizing or partially unrolling a loop creates new loops
9322   // and can invalidate iterators across the loops.
9323   SmallVector<Loop *, 8> Worklist;
9324 
9325   for (Loop *L : *LI)
9326     collectSupportedLoops(*L, LI, ORE, Worklist);
9327 
9328   LoopsAnalyzed += Worklist.size();
9329 
9330   // Now walk the identified inner loops.
9331   while (!Worklist.empty()) {
9332     Loop *L = Worklist.pop_back_val();
9333 
9334     // For the inner loops we actually process, form LCSSA to simplify the
9335     // transform.
9336     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
9337 
9338     Changed |= CFGChanged |= processLoop(L);
9339   }
9340 
9341   // Process each loop nest in the function.
9342   return LoopVectorizeResult(Changed, CFGChanged);
9343 }
9344 
9345 PreservedAnalyses LoopVectorizePass::run(Function &F,
9346                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
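    // Adapt the function-level analyses into the LoopStandardAnalysisResults
    // bundle expected by the loop-level LoopAccessAnalysis.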
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
9391