//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
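//
// As an illustrative sketch (not code from this pass), a scalar loop such as
//
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten, for a vectorization factor of 4, into
//
//   for (i = 0; i + 3 < n; i += 4) {
//     a[i]     = b[i]     + c[i];      // the four lanes of one 'wide'
//     a[i + 1] = b[i + 1] + c[i + 1];  // iteration are emitted as a single
//     a[i + 2] = b[i + 2] + c[i + 2];  // SIMD add on <4 x ...> vectors
//     a[i + 3] = b[i + 3] + c[i + 3];
//   }
//   for (; i < n; ++i)                 // scalar epilogue for the remainder
//     a[i] = b[i] + c[i];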
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanPredicator.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupAll =
    "llvm.loop.vectorize.followup_all";
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
// and that predication is preferred: the vectorizer will try to fold the
// tail-loop (epilogue) into the vector body and predicate the instructions
// accordingly. If tail-folding fails, the values below select the fallback
// strategy:
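//
// As a usage sketch (assuming the usual ways LLVM cl::opt flags are passed to
// the tools; the exact pipeline spelling below is illustrative), e.g.:
//   opt -passes=loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue ...
// or, from clang, -mllvm -prefer-predicate-over-epilogue=<value>.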
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
}

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "Prefer tail-folding, create scalar epilogue if "
                          "tail-folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "Prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting the vectorization factor, "
             "which will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// A helper function that returns the type of the loaded or stored value.
static Type *getMemInstValueType(Value *I) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Expected Load or Store instruction");
  if (auto *LI = dyn_cast<LoadInst>(I))
    return LI->getType();
  return cast<StoreInst>(I)->getValueOperand()->getType();
}

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
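/// For example (an illustrative case, not an exhaustive rule): on targets
/// where x86_fp80 has an 80-bit store size but a 96- or 128-bit allocation
/// size, an array of VF such elements contains padding and is not bitcast
/// compatible with a <VF x x86_fp80> vector, so the type is irregular.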
static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  // Determine if an array of VF elements of type Ty is "bitcast compatible"
  // with a <VF x Ty> vector.
  if (VF.isVector()) {
    auto *VectorTy = VectorType::get(Ty, VF);
    return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
  }

  // If the vectorization factor is one, we just check if an array of type Ty
  // requires padding between elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
///       we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }

/// A helper function that adds a 'fast' flag to floating-point operations.
static Value *addFastMathFlag(Value *V) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
  return V;
}

static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
  if (isa<FPMathOperator>(V))
    cast<Instruction>(V)->setFastMathFlags(FMF);
  return V;
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns the "best known" trip count for the specified loop \p L, as defined
/// by the following procedure:
///   1) Returns the exact trip count if it is known.
///   2) Returns the expected trip count according to profile data, if any.
///   3) Returns an upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}

namespace llvm {

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or into multiple
/// scalar copies. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for the given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
        BFI(BFI), PSI(PSI) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop.
  BasicBlock *createVectorizedLoopSkeleton();

  /// Widen a single instruction within the innermost loop.
  void widenInstruction(Instruction &I, VPUser &Operands,
                        VPTransformState &State);

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Widen a single select instruction within the innermost loop.
  void widenSelectInstruction(SelectInst &I, VPUser &Operands,
                              bool InvariantCond, VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop();

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// Vectorize a single GetElementPtrInst based on information gathered and
  /// decisions taken during planning.
  void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
                ElementCount VF, bool IsPtrLoopInvariant,
                SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);

  /// Vectorize a single PHINode in a block. This method handles the induction
  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
  /// arbitrary length vectors.
  void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p Operands instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
  /// is provided, the integer induction variable will first be truncated to
  /// the corresponding type.
  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);

  /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
  /// vector or scalar value on-demand if one is not yet available. When
  /// vectorizing a loop, we visit the definition of an instruction before its
  /// uses. When visiting the definition, we either vectorize or scalarize the
  /// instruction, creating an entry for it in the corresponding map. (In some
  /// cases, such as induction variables, we will create both vector and scalar
  /// entries.) Then, as we encounter uses of the definition, we derive values
  /// for each scalar or vector use unless such a value is already available.
  /// For example, if we scalarize a definition and one of its uses is vector,
  /// we build the required vector on-demand with an insertelement sequence
  /// when visiting the use. Otherwise, if the use is scalar, we can use the
  /// existing scalar definition.
  ///
  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll index \p Part. If the value has already been vectorized,
  /// the corresponding vector entry in VectorLoopValueMap is returned. If,
  /// however, the value has a scalar entry in VectorLoopValueMap, we construct
  /// a new vector value on-demand by inserting the scalar values into a vector
  /// with an insertelement sequence. If the value has been neither vectorized
  /// nor scalarized, it must be loop invariant, so we simply broadcast the
  /// value into a vector.
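  ///
  /// As a small illustration (a sketch of the mechanism described above, not
  /// code from this file): if a definition was scalarized with UF = 2 and
  /// VF = 4, a vector use of it at Part 0 is satisfied on demand by inserting
  /// the four scalar instances for Part 0 into a vector via an insertelement
  /// chain, and the result is cached in VectorLoopValueMap for later reuse.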
  Value *getOrCreateVectorValue(Value *V, unsigned Part);

  /// Return a value in the new loop corresponding to \p V from the original
  /// loop at unroll and vector indices \p Instance. If the value has been
  /// vectorized but not scalarized, the necessary extractelement instruction
  /// will be generated.
  Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                VPTransformState &State, VPValue *Addr,
                                VPValue *BlockInMask = nullptr);

  /// Vectorize Load and Store instructions with the base address given in \p
  /// Addr, optionally masking the vector operations if \p BlockInMask is
  /// non-null. Use \p State to translate given VPValues to IR values in the
  /// vectorized loop.
  void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                  VPValue *Addr, VPValue *StoredValue,
                                  VPValue *BlockInMask);

  /// Set the debug location in the builder using the debug location in
  /// the instruction.
  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);

  /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs(void);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *CountRoundDown, Value *EndValue,
                    BasicBlock *MiddleBlock);

  /// Create a new induction variable inside L.
  PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
                                   Value *Step, Instruction *DL);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs();

  /// Fix a first-order recurrence. This is the second phase of vectorizing
  /// this phi node.
  void fixFirstOrderRecurrence(PHINode *Phi);

  /// Fix a reduction cross-iteration phi. This is the second phase of
  /// vectorizing this phi node.
  void fixReduction(PHINode *Phi);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);

  /// The loop exit block may have single-value PHI nodes with some
  /// incoming value. While vectorizing, we only handle real values
  /// that were defined inside the loop, and we should have one value for
  /// each predecessor of its parent basic block. See PR14725.
  void fixLCSSAPHIs();

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop-invariant values and for the induction
  /// value. If this is the induction variable, then we extend it to N, N+1, ...;
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
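  /// For example (an illustrative sketch): with StartIdx 0, Step 1, and VF 4,
  /// an integer Val of <n, n, n, n> becomes <n, n+1, n+2, n+3>.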
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                               Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We have already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - the original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
  /// latter case \p EntryVal is a TruncInst and we must not record anything for
  /// that IV, but it's error-prone to expect callers of this routine to care
  /// about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
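  /// For example (an illustrative sketch): for an integer induction with
  /// StartValue 8 and StepValue 4, Index 3 is transformed to 8 + 3 * 4 = 20;
  /// for a pointer induction with the same values, the result is the address
  /// &StartValue[3 * 4].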
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume the iteration
  /// count in the scalar epilogue, from where the vectorized loop left off
  /// (given by \p VectorTripCount).
  void createInductionResumeValues(Loop *L, Value *VectorTripCount);

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks.  Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata).  Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning.  It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile-guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};

} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
    const DILocation *DIL = Inst->getDebugLoc();
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst)) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B.SetCurrentDebugLocation(NewDIL.getValue());
      else
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
  } else
    B.SetCurrentDebugLocation(DebugLoc());
}

/// Write a record \p DebugMsg about vectorization failure to the debug
/// output stream. If \p I is passed, it is an instruction that prevents
/// vectorization.
#ifndef NDEBUG
static void debugVectorizationFailure(const StringRef DebugMsg,
    Instruction *I) {
  dbgs() << "LV: Not vectorizing: " << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed.
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
/// RemarkName is the identifier for the remark.  If \p I is passed it is an
/// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
/// the location of the remark.  \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
  R << "loop not vectorized: ";
  return R;
}

namespace llvm {

void reportVectorizationFailure(const StringRef DebugMsg,
    const StringRef OREMsg, const StringRef ORETag,
    OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                ORETag, TheLoop, I) << OREMsg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::addNewMetadata(Instruction *To,
                                         const Instruction *Orig) {
  // If the loop was versioned with memchecks, add the corresponding no-alias
  // metadata.
  if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
    LVer->annotateInstWithNoAlias(To, Orig);
}

void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}

void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
                                      Instruction *From) {
  for (Value *V : To) {
    if (Instruction *I = dyn_cast<Instruction>(V))
      addMetadata(I, From);
  }
}

namespace llvm {

// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
/// expected speedups/slowdowns due to the supported instruction set. We use
/// the TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero,
  /// then that vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);

  /// Setup cost-based decisions for user vectorization factor.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that need to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If the interleave count has been specified by metadata, it will be
  /// returned. Otherwise, the interleave count is computed and returned. VF
  /// and LoopCost are the selected vectorization factor and the cost of the
  /// selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto Scalars = InstsToScalarize.find(VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.find(I) != Scalars->second.end();
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    if (VF.isScalar())
      return true;

    // Cost model is not run in the VPlan-native path - return conservative
    // result until this changes.
    if (EnableVPlanNativePath)
      return false;

    auto UniformsPerVF = Uniforms.find(VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(I);
  }
1156 
1157   /// Returns true if \p I is known to be scalar after vectorization.
1158   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1159     if (VF.isScalar())
1160       return true;
1161 
1162     // Cost model is not run in the VPlan-native path - return conservative
1163     // result until this changes.
1164     if (EnableVPlanNativePath)
1165       return false;
1166 
1167     auto ScalarsPerVF = Scalars.find(VF);
1168     assert(ScalarsPerVF != Scalars.end() &&
1169            "Scalar values are not calculated for VF");
1170     return ScalarsPerVF->second.count(I);
1171   }
1172 
1173   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1174   /// for vectorization factor \p VF.
1175   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1176     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1177            !isProfitableToScalarize(I, VF) &&
1178            !isScalarAfterVectorization(I, VF);
1179   }
1180 
1181   /// Decision that was taken during cost calculation for memory instruction.
1182   enum InstWidening {
1183     CM_Unknown,
1184     CM_Widen,         // For consecutive accesses with stride +1.
1185     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1186     CM_Interleave,
1187     CM_GatherScatter,
1188     CM_Scalarize
1189   };
1190 
1191   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1192   /// instruction \p I and vector width \p VF.
1193   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1194                            unsigned Cost) {
1195     assert(VF.isVector() && "Expected VF >=2");
1196     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1197   }
1198 
1199   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1200   /// interleaving group \p Grp and vector width \p VF.
1201   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1202                            ElementCount VF, InstWidening W, unsigned Cost) {
1203     assert(VF.isVector() && "Expected VF >=2");
1204     /// Broadcast this decicion to all instructions inside the group.
1205     /// But the cost will be assigned to one instruction only.
1206     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1207       if (auto *I = Grp->getMember(i)) {
1208         if (Grp->getInsertPos() == I)
1209           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1210         else
1211           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1212       }
1213     }
1214   }
1215 
1216   /// Return the cost model decision for the given instruction \p I and vector
1217   /// width \p VF. Return CM_Unknown if this instruction did not pass
1218   /// through the cost modeling.
1219   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1220     assert(!VF.isScalable() && "scalable vectors not yet supported.");
1221     assert(VF.isVector() && "Expected VF >=2");
1222 
1223     // Cost model is not run in the VPlan-native path - return conservative
1224     // result until this changes.
1225     if (EnableVPlanNativePath)
1226       return CM_GatherScatter;
1227 
1228     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1229     auto Itr = WideningDecisions.find(InstOnVF);
1230     if (Itr == WideningDecisions.end())
1231       return CM_Unknown;
1232     return Itr->second.first;
1233   }
1234 
1235   /// Return the vectorization cost for the given instruction \p I and vector
1236   /// width \p VF.
1237   unsigned getWideningCost(Instruction *I, ElementCount VF) {
1238     assert(VF.isVector() && "Expected VF >=2");
1239     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1240     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1241            "The cost is not calculated");
1242     return WideningDecisions[InstOnVF].second;
1243   }
1244 
1245   /// Return True if instruction \p I is an optimizable truncate whose operand
1246   /// is an induction variable. Such a truncate will be removed by adding a new
1247   /// induction variable with the destination type.
1248   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1249     // If the instruction is not a truncate, return false.
1250     auto *Trunc = dyn_cast<TruncInst>(I);
1251     if (!Trunc)
1252       return false;
1253 
1254     // Get the source and destination types of the truncate.
1255     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1256     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1257 
1258     // If the truncate is free for the given types, return false. Replacing a
1259     // free truncate with an induction variable would add an induction variable
1260     // update instruction to each iteration of the loop. We exclude from this
1261     // check the primary induction variable since it will need an update
1262     // instruction regardless.
1263     Value *Op = Trunc->getOperand(0);
1264     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1265       return false;
1266 
1267     // If the truncated value is not an induction variable, return false.
1268     return Legal->isInductionPhi(Op);
1269   }
1270 
1271   /// Collects the instructions to scalarize for each predicated instruction in
1272   /// the loop.
1273   void collectInstsToScalarize(ElementCount VF);
1274 
1275   /// Collect Uniform and Scalar values for the given \p VF.
1276   /// The sets depend on CM decision for Load/Store instructions
1277   /// that may be vectorized as interleave, gather-scatter or scalarized.
1278   void collectUniformsAndScalars(ElementCount VF) {
1279     // Do the analysis once.
1280     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1281       return;
1282     setCostBasedWideningDecision(VF);
1283     collectLoopUniforms(VF);
1284     collectLoopScalars(VF);
1285   }
1286 
1287   /// Returns true if the target machine supports masked store operation
1288   /// for the given \p DataType and kind of access to \p Ptr.
1289   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1290     return Legal->isConsecutivePtr(Ptr) &&
1291            TTI.isLegalMaskedStore(DataType, Alignment);
1292   }
1293 
1294   /// Returns true if the target machine supports masked load operation
1295   /// for the given \p DataType and kind of access to \p Ptr.
1296   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1297     return Legal->isConsecutivePtr(Ptr) &&
1298            TTI.isLegalMaskedLoad(DataType, Alignment);
1299   }
1300 
1301   /// Returns true if the target machine supports masked scatter operation
1302   /// for the given \p DataType.
1303   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1304     return TTI.isLegalMaskedScatter(DataType, Alignment);
1305   }
1306 
1307   /// Returns true if the target machine supports masked gather operation
1308   /// for the given \p DataType.
1309   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1310     return TTI.isLegalMaskedGather(DataType, Alignment);
1311   }
1312 
1313   /// Returns true if the target machine can represent \p V as a masked gather
1314   /// or scatter operation.
1315   bool isLegalGatherOrScatter(Value *V) {
1316     bool LI = isa<LoadInst>(V);
1317     bool SI = isa<StoreInst>(V);
1318     if (!LI && !SI)
1319       return false;
1320     auto *Ty = getMemInstValueType(V);
1321     Align Align = getLoadStoreAlignment(V);
1322     return (LI && isLegalMaskedGather(Ty, Align)) ||
1323            (SI && isLegalMaskedScatter(Ty, Align));
1324   }
1325 
1326   /// Returns true if \p I is an instruction that will be scalarized with
1327   /// predication. Such instructions include conditional stores and
1328   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1331   bool isScalarWithPredication(Instruction *I,
1332                                ElementCount VF = ElementCount::getFixed(1));
1333 
1334   // Returns true if \p I is an instruction that will be predicated either
1335   // through scalar predication or masked load/store or masked gather/scatter.
1336   // Superset of instructions that return true for isScalarWithPredication.
1337   bool isPredicatedInst(Instruction *I) {
1338     if (!blockNeedsPredication(I->getParent()))
1339       return false;
1340     // Loads and stores that need some form of masked operation are predicated
1341     // instructions.
1342     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1343       return Legal->isMaskRequired(I);
1344     return isScalarWithPredication(I);
1345   }
1346 
1347   /// Returns true if \p I is a memory instruction with consecutive memory
1348   /// access that can be widened.
1349   bool
1350   memoryInstructionCanBeWidened(Instruction *I,
1351                                 ElementCount VF = ElementCount::getFixed(1));
1352 
1353   /// Returns true if \p I is a memory instruction in an interleaved-group
1354   /// of memory accesses that can be vectorized with wide vector loads/stores
1355   /// and shuffles.
1356   bool
1357   interleavedAccessCanBeWidened(Instruction *I,
1358                                 ElementCount VF = ElementCount::getFixed(1));
1359 
1360   /// Check if \p Instr belongs to any interleaved access group.
1361   bool isAccessInterleaved(Instruction *Instr) {
1362     return InterleaveInfo.isInterleaved(Instr);
1363   }
1364 
1365   /// Get the interleaved access group that \p Instr belongs to.
1366   const InterleaveGroup<Instruction> *
1367   getInterleavedAccessGroup(Instruction *Instr) {
1368     return InterleaveInfo.getInterleaveGroup(Instr);
1369   }
1370 
1371   /// Returns true if an interleaved group requires a scalar iteration
1372   /// to handle accesses with gaps, and there is nothing preventing us from
1373   /// creating a scalar epilogue.
1374   bool requiresScalarEpilogue() const {
1375     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1376   }
1377 
1378   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1379   /// loop hint annotation.
1380   bool isScalarEpilogueAllowed() const {
1381     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1382   }
1383 
1384   /// Returns true if all loop blocks should be masked to fold tail loop.
1385   bool foldTailByMasking() const { return FoldTailByMasking; }
1386 
1387   bool blockNeedsPredication(BasicBlock *BB) {
1388     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1389   }
1390 
1391   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1392   /// nodes to the chain of instructions representing the reductions. Uses a
1393   /// MapVector to ensure deterministic iteration order.
1394   using ReductionChainMap =
1395       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1396 
1397   /// Return the chain of instructions representing an inloop reduction.
1398   const ReductionChainMap &getInLoopReductionChains() const {
1399     return InLoopReductionChains;
1400   }
1401 
1402   /// Returns true if the Phi is part of an inloop reduction.
1403   bool isInLoopReduction(PHINode *Phi) const {
1404     return InLoopReductionChains.count(Phi);
1405   }
1406 
1407   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1408   /// with factor VF.  Return the cost of the instruction, including
1409   /// scalarization overhead if it's needed.
1410   unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1411 
1412   /// Estimate cost of a call instruction CI if it were vectorized with factor
1413   /// VF. Return the cost of the instruction, including scalarization overhead
1414   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e., either a vector version isn't available, or it is too
  /// expensive.
1417   unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1418                              bool &NeedToScalarize);
1419 
1420   /// Invalidates decisions already taken by the cost model.
1421   void invalidateCostModelingDecisions() {
1422     WideningDecisions.clear();
1423     Uniforms.clear();
1424     Scalars.clear();
1425   }
1426 
1427 private:
1428   unsigned NumPredStores = 0;
1429 
1430   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1431   /// than zero. One is returned if vectorization should best be avoided due
1432   /// to cost.
1433   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1434 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1442   using VectorizationCostTy = std::pair<unsigned, bool>;
1443 
1444   /// Returns the expected execution cost. The unit of the cost does
1445   /// not matter because we use the 'cost' units to compare different
1446   /// vector widths. The cost that is returned is *not* normalized by
1447   /// the factor width.
1448   VectorizationCostTy expectedCost(ElementCount VF);
1449 
1450   /// Returns the execution time cost of an instruction for a given vector
1451   /// width. Vector width of one means scalar.
1452   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1453 
1454   /// The cost-computation logic from getInstructionCost which provides
1455   /// the vector type as an output parameter.
1456   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1457 
1458   /// Calculate vectorization cost of memory instruction \p I.
1459   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1460 
1461   /// The cost computation for scalarized memory instruction.
1462   unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1463 
1464   /// The cost computation for interleaving group of memory instructions.
1465   unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1466 
1467   /// The cost computation for Gather/Scatter instruction.
1468   unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1469 
1470   /// The cost computation for widening instruction \p I with consecutive
1471   /// memory access.
1472   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1473 
1474   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1475   /// Load: scalar load + broadcast.
1476   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1477   /// element)
1478   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1479 
1480   /// Estimate the overhead of scalarizing an instruction. This is a
1481   /// convenience wrapper for the type-based getScalarizationOverhead API.
1482   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1483 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1486   bool isConsecutiveLoadOrStore(Instruction *I);
1487 
1488   /// Returns true if an artificially high cost for emulated masked memrefs
1489   /// should be used.
1490   bool useEmulatedMaskMemRefHack(Instruction *I);
1491 
1492   /// Map of scalar integer values to the smallest bitwidth they can be legally
1493   /// represented as. The vector equivalents of these values should be truncated
1494   /// to this type.
1495   MapVector<Instruction *, uint64_t> MinBWs;
1496 
1497   /// A type representing the costs for instructions if they were to be
1498   /// scalarized rather than vectorized. The entries are Instruction-Cost
1499   /// pairs.
1500   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1501 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1504   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1505 
1506   /// Records whether it is allowed to have the original scalar loop execute at
1507   /// least once. This may be needed as a fallback loop in case runtime
1508   /// aliasing/dependence checks fail, or to handle the tail/remainder
1509   /// iterations when the trip count is unknown or doesn't divide by the VF,
1510   /// or as a peel-loop to handle gaps in interleave-groups.
1511   /// Under optsize and when the trip count is very small we don't allow any
1512   /// iterations to execute in the scalar loop.
1513   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1514 
1515   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1516   bool FoldTailByMasking = false;
1517 
1518   /// A map holding scalar costs for different vectorization factors. The
1519   /// presence of a cost for an instruction in the mapping indicates that the
1520   /// instruction will be scalarized when vectorizing with the associated
1521   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1522   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1523 
1524   /// Holds the instructions known to be uniform after vectorization.
1525   /// The data is collected per VF.
1526   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1527 
1528   /// Holds the instructions known to be scalar after vectorization.
1529   /// The data is collected per VF.
1530   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1531 
1532   /// Holds the instructions (address computations) that are forced to be
1533   /// scalarized.
1534   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1535 
  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
1539   ReductionChainMap InLoopReductionChains;
1540 
1541   /// Returns the expected difference in cost from scalarizing the expression
1542   /// feeding a predicated instruction \p PredInst. The instructions to
1543   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1544   /// non-negative return value implies the expression will be scalarized.
1545   /// Currently, only single-use chains are considered for scalarization.
1546   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1547                               ElementCount VF);
1548 
1549   /// Collect the instructions that are uniform after vectorization. An
1550   /// instruction is uniform if we represent it with a single scalar value in
1551   /// the vectorized loop corresponding to each vector iteration. Examples of
1552   /// uniform instructions include pointer operands of consecutive or
1553   /// interleaved memory accesses. Note that although uniformity implies an
1554   /// instruction will be scalar, the reverse is not true. In general, a
1555   /// scalarized instruction will be represented by VF scalar values in the
1556   /// vectorized loop, each corresponding to an iteration of the original
1557   /// scalar loop.
1558   void collectLoopUniforms(ElementCount VF);
1559 
1560   /// Collect the instructions that are scalar after vectorization. An
1561   /// instruction is scalar if it is known to be uniform or will be scalarized
1562   /// during vectorization. Non-uniform scalarized instructions will be
1563   /// represented by VF values in the vectorized loop, each corresponding to an
1564   /// iteration of the original scalar loop.
1565   void collectLoopScalars(ElementCount VF);
1566 
1567   /// Keeps cost model vectorization decision and cost for instructions.
1568   /// Right now it is used for memory instructions only.
1569   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1570                                 std::pair<InstWidening, unsigned>>;
1571 
1572   DecisionList WideningDecisions;
1573 
1574   /// Returns true if \p V is expected to be vectorized and it needs to be
1575   /// extracted.
1576   bool needsExtract(Value *V, ElementCount VF) const {
1577     Instruction *I = dyn_cast<Instruction>(V);
1578     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1579         TheLoop->isLoopInvariant(I))
1580       return false;
1581 
1582     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1584     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1585     // the scalars are collected. That should be a safe assumption in most
1586     // cases, because we check if the operands have vectorizable types
1587     // beforehand in LoopVectorizationLegality.
1588     return Scalars.find(VF) == Scalars.end() ||
1589            !isScalarAfterVectorization(I, VF);
1590   };
1591 
1592   /// Returns a range containing only operands needing to be extracted.
1593   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1594                                                    ElementCount VF) {
1595     return SmallVector<Value *, 4>(make_filter_range(
1596         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1597   }
1598 
1599 public:
1600   /// The loop that we evaluate.
1601   Loop *TheLoop;
1602 
1603   /// Predicated scalar evolution analysis.
1604   PredicatedScalarEvolution &PSE;
1605 
1606   /// Loop Info analysis.
1607   LoopInfo *LI;
1608 
1609   /// Vectorization legality.
1610   LoopVectorizationLegality *Legal;
1611 
1612   /// Vector target information.
1613   const TargetTransformInfo &TTI;
1614 
1615   /// Target Library Info.
1616   const TargetLibraryInfo *TLI;
1617 
1618   /// Demanded bits analysis.
1619   DemandedBits *DB;
1620 
1621   /// Assumption cache.
1622   AssumptionCache *AC;
1623 
1624   /// Interface to emit optimization remarks.
1625   OptimizationRemarkEmitter *ORE;
1626 
1627   const Function *TheFunction;
1628 
1629   /// Loop Vectorize Hint.
1630   const LoopVectorizeHints *Hints;
1631 
1632   /// The interleave access information contains groups of interleaved accesses
1633   /// with the same stride and close to each other.
1634   InterleavedAccessInfo &InterleaveInfo;
1635 
1636   /// Values to ignore in the cost model.
1637   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1638 
1639   /// Values to ignore in the cost model when VF > 1.
1640   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1641 };
1642 
1643 } // end namespace llvm
1644 
1645 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1652 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1653 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1654 // provides *explicit vectorization hints* (LV can bypass legal checks and
1655 // assume that vectorization is legal). However, both hints are implemented
1656 // using the same metadata (llvm.loop.vectorize, processed by
1657 // LoopVectorizeHints). This will be fixed in the future when the native IR
1658 // representation for pragma 'omp simd' is introduced.
1659 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1660                                    OptimizationRemarkEmitter *ORE) {
1661   assert(!OuterLp->empty() && "This is not an outer loop");
1662   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1663 
1664   // Only outer loops with an explicit vectorization hint are supported.
1665   // Unannotated outer loops are ignored.
1666   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1667     return false;
1668 
1669   Function *Fn = OuterLp->getHeader()->getParent();
1670   if (!Hints.allowVectorization(Fn, OuterLp,
1671                                 true /*VectorizeOnlyWhenForced*/)) {
1672     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1673     return false;
1674   }
1675 
1676   if (Hints.getInterleave() > 1) {
1677     // TODO: Interleave support is future work.
1678     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1679                          "outer loops.\n");
1680     Hints.emitRemarkWithHints();
1681     return false;
1682   }
1683 
1684   return true;
1685 }
1686 
1687 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1688                                   OptimizationRemarkEmitter *ORE,
1689                                   SmallVectorImpl<Loop *> &V) {
1690   // Collect inner loops and outer loops without irreducible control flow. For
1691   // now, only collect outer loops that have explicit vectorization hints. If we
1692   // are stress testing the VPlan H-CFG construction, we collect the outermost
1693   // loop of every loop nest.
1694   if (L.empty() || VPlanBuildStressTest ||
1695       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1696     LoopBlocksRPO RPOT(&L);
1697     RPOT.perform(LI);
1698     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1699       V.push_back(&L);
1700       // TODO: Collect inner loops inside marked outer loops in case
1701       // vectorization fails for the outer loop. Do not invoke
1702       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1703       // already known to be reducible. We can use an inherited attribute for
1704       // that.
1705       return;
1706     }
1707   }
1708   for (Loop *InnerL : L)
1709     collectSupportedLoops(*InnerL, LI, ORE, V);
1710 }
1711 
1712 namespace {
1713 
1714 /// The LoopVectorize Pass.
1715 struct LoopVectorize : public FunctionPass {
1716   /// Pass identification, replacement for typeid
1717   static char ID;
1718 
1719   LoopVectorizePass Impl;
1720 
1721   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1722                          bool VectorizeOnlyWhenForced = false)
1723       : FunctionPass(ID),
1724         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1725     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1726   }
1727 
1728   bool runOnFunction(Function &F) override {
1729     if (skipFunction(F))
1730       return false;
1731 
1732     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1733     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1734     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1735     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1736     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1737     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1738     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1739     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1740     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1741     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1742     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1743     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1744     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1745 
1746     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1747         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1748 
1749     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1750                         GetLAA, *ORE, PSI).MadeAnyChange;
1751   }
1752 
1753   void getAnalysisUsage(AnalysisUsage &AU) const override {
1754     AU.addRequired<AssumptionCacheTracker>();
1755     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1756     AU.addRequired<DominatorTreeWrapperPass>();
1757     AU.addRequired<LoopInfoWrapperPass>();
1758     AU.addRequired<ScalarEvolutionWrapperPass>();
1759     AU.addRequired<TargetTransformInfoWrapperPass>();
1760     AU.addRequired<AAResultsWrapperPass>();
1761     AU.addRequired<LoopAccessLegacyAnalysis>();
1762     AU.addRequired<DemandedBitsWrapperPass>();
1763     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1764     AU.addRequired<InjectTLIMappingsLegacy>();
1765 
1766     // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for the non-VPlan-native path.
1769     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1770     if (!EnableVPlanNativePath) {
1771       AU.addPreserved<LoopInfoWrapperPass>();
1772       AU.addPreserved<DominatorTreeWrapperPass>();
1773     }
1774 
1775     AU.addPreserved<BasicAAWrapperPass>();
1776     AU.addPreserved<GlobalsAAWrapperPass>();
1777     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1778   }
1779 };
1780 
1781 } // end anonymous namespace
1782 
1783 //===----------------------------------------------------------------------===//
1784 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1785 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1786 //===----------------------------------------------------------------------===//
1787 
1788 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1789   // We need to place the broadcast of invariant variables outside the loop,
1790   // but only if it's proven safe to do so. Else, broadcast will be inside
1791   // vector loop body.
1792   Instruction *Instr = dyn_cast<Instruction>(V);
1793   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1794                      (!Instr ||
1795                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1796   // Place the code for broadcasting invariant variables in the new preheader.
1797   IRBuilder<>::InsertPointGuard Guard(Builder);
1798   if (SafeToHoist)
1799     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1800 
1801   // Broadcast the scalar into all locations in the vector.
1802   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1803 
1804   return Shuf;
1805 }
1806 
1807 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1808     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1809   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1810          "Expected either an induction phi-node or a truncate of it!");
1811   Value *Start = II.getStartValue();
1812 
1813   // Construct the initial value of the vector IV in the vector loop preheader
1814   auto CurrIP = Builder.saveIP();
1815   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1816   if (isa<TruncInst>(EntryVal)) {
1817     assert(Start->getType()->isIntegerTy() &&
1818            "Truncation requires an integer type");
1819     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1820     Step = Builder.CreateTrunc(Step, TruncType);
1821     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1822   }
1823   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1824   Value *SteppedStart =
1825       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1826 
1827   // We create vector phi nodes for both integer and floating-point induction
1828   // variables. Here, we determine the kind of arithmetic we will perform.
1829   Instruction::BinaryOps AddOp;
1830   Instruction::BinaryOps MulOp;
1831   if (Step->getType()->isIntegerTy()) {
1832     AddOp = Instruction::Add;
1833     MulOp = Instruction::Mul;
1834   } else {
1835     AddOp = II.getInductionOpcode();
1836     MulOp = Instruction::FMul;
1837   }
1838 
1839   // Multiply the vectorization factor by the step using integer or
1840   // floating-point arithmetic as appropriate.
1841   Value *ConstVF =
1842       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
1843   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1844 
1845   // Create a vector splat to use in the induction update.
1846   //
1847   // FIXME: If the step is non-constant, we create the vector splat with
1848   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1849   //        handle a constant vector splat.
1850   assert(!VF.isScalable() && "scalable vectors not yet supported.");
1851   Value *SplatVF = isa<Constant>(Mul)
1852                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1853                        : Builder.CreateVectorSplat(VF, Mul);
1854   Builder.restoreIP(CurrIP);
1855 
1856   // We may need to add the step a number of times, depending on the unroll
1857   // factor. The last of those goes into the PHI.
1858   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1859                                     &*LoopVectorBody->getFirstInsertionPt());
1860   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1861   Instruction *LastInduction = VecInd;
1862   for (unsigned Part = 0; Part < UF; ++Part) {
1863     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1864 
1865     if (isa<TruncInst>(EntryVal))
1866       addMetadata(LastInduction, EntryVal);
1867     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1868 
1869     LastInduction = cast<Instruction>(addFastMathFlag(
1870         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1871     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1872   }
1873 
1874   // Move the last step to the end of the latch block. This ensures consistent
1875   // placement of all induction updates.
1876   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1877   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1878   auto *ICmp = cast<Instruction>(Br->getCondition());
1879   LastInduction->moveBefore(ICmp);
1880   LastInduction->setName("vec.ind.next");
1881 
1882   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1883   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1884 }
1885 
1886 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1887   return Cost->isScalarAfterVectorization(I, VF) ||
1888          Cost->isProfitableToScalarize(I, VF);
1889 }
1890 
1891 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1892   if (shouldScalarizeInstruction(IV))
1893     return true;
1894   auto isScalarInst = [&](User *U) -> bool {
1895     auto *I = cast<Instruction>(U);
1896     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1897   };
1898   return llvm::any_of(IV->users(), isScalarInst);
1899 }
1900 
1901 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1902     const InductionDescriptor &ID, const Instruction *EntryVal,
1903     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1904   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1905          "Expected either an induction phi-node or a truncate of it!");
1906 
1907   // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1913   if (isa<TruncInst>(EntryVal))
1914     return;
1915 
1916   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1917   if (Casts.empty())
1918     return;
1919   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
  // induction update chain itself.
1922   Instruction *CastInst = *Casts.begin();
1923   if (Lane < UINT_MAX)
1924     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1925   else
1926     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1927 }
1928 
1929 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1930   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1931          "Primary induction variable must have an integer type");
1932 
1933   auto II = Legal->getInductionVars().find(IV);
1934   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1935 
1936   auto ID = II->second;
1937   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1938 
1939   // The value from the original loop to which we are mapping the new induction
1940   // variable.
1941   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1942 
1943   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1944 
  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1947   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1948     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1949            "Induction step should be loop invariant");
1950     if (PSE.getSE()->isSCEVable(IV->getType())) {
1951       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1952       return Exp.expandCodeFor(Step, Step->getType(),
1953                                LoopVectorPreHeader->getTerminator());
1954     }
1955     return cast<SCEVUnknown>(Step)->getValue();
1956   };
1957 
1958   // The scalar value to broadcast. This is derived from the canonical
1959   // induction variable. If a truncation type is given, truncate the canonical
1960   // induction variable and step. Otherwise, derive these values from the
1961   // induction descriptor.
1962   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1963     Value *ScalarIV = Induction;
1964     if (IV != OldInduction) {
1965       ScalarIV = IV->getType()->isIntegerTy()
1966                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1967                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1968                                           IV->getType());
1969       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1970       ScalarIV->setName("offset.idx");
1971     }
1972     if (Trunc) {
1973       auto *TruncType = cast<IntegerType>(Trunc->getType());
1974       assert(Step->getType()->isIntegerTy() &&
1975              "Truncation requires an integer step");
1976       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1977       Step = Builder.CreateTrunc(Step, TruncType);
1978     }
1979     return ScalarIV;
1980   };
1981 
1982   // Create the vector values from the scalar IV, in the absence of creating a
1983   // vector IV.
1984   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1985     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1986     for (unsigned Part = 0; Part < UF; ++Part) {
1987       assert(!VF.isScalable() && "scalable vectors not yet supported.");
1988       Value *EntryPart =
1989           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
1990                         ID.getInductionOpcode());
1991       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1992       if (Trunc)
1993         addMetadata(EntryPart, Trunc);
1994       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1995     }
1996   };
1997 
1998   // Now do the actual transformations, and start with creating the step value.
1999   Value *Step = CreateStepValue(ID.getStep());
2000   if (VF.isZero() || VF.isScalar()) {
2001     Value *ScalarIV = CreateScalarIV(Step);
2002     CreateSplatIV(ScalarIV, Step);
2003     return;
2004   }
2005 
2006   // Determine if we want a scalar version of the induction variable. This is
2007   // true if the induction variable itself is not widened, or if it has at
2008   // least one user in the loop that is not widened.
2009   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2010   if (!NeedsScalarIV) {
2011     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2012     return;
2013   }
2014 
2015   // Try to create a new independent vector induction variable. If we can't
2016   // create the phi node, we will splat the scalar induction variable in each
2017   // loop iteration.
2018   if (!shouldScalarizeInstruction(EntryVal)) {
2019     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2020     Value *ScalarIV = CreateScalarIV(Step);
2021     // Create scalar steps that can be used by instructions we will later
2022     // scalarize. Note that the addition of the scalar steps will not increase
2023     // the number of instructions in the loop in the common case prior to
2024     // InstCombine. We will be trading one vector extract for each scalar step.
2025     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2026     return;
2027   }
2028 
  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV, except when we tail-fold, in which case the splat IV feeds
  // the predicate used by the masked loads/stores.
2032   Value *ScalarIV = CreateScalarIV(Step);
2033   if (!Cost->isScalarEpilogueAllowed())
2034     CreateSplatIV(ScalarIV, Step);
2035   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2036 }
2037 
2038 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2039                                           Instruction::BinaryOps BinOp) {
2040   // Create and check the types.
2041   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2042   int VLen = ValVTy->getNumElements();
2043 
2044   Type *STy = Val->getType()->getScalarType();
2045   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2046          "Induction Step must be an integer or FP");
2047   assert(Step->getType() == STy && "Step has wrong type");
2048 
2049   SmallVector<Constant *, 8> Indices;
2050 
2051   if (STy->isIntegerTy()) {
2052     // Create a vector of consecutive numbers from zero to VF.
2053     for (int i = 0; i < VLen; ++i)
2054       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2055 
2056     // Add the consecutive indices to the vector value.
2057     Constant *Cv = ConstantVector::get(Indices);
2058     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2059     Step = Builder.CreateVectorSplat(VLen, Step);
2060     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    //        flags, which can be found from the original scalar operations.
2063     Step = Builder.CreateMul(Cv, Step);
2064     return Builder.CreateAdd(Val, Step, "induction");
2065   }
2066 
2067   // Floating point induction.
2068   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2069          "Binary Opcode should be specified for FP induction");
2070   // Create a vector of consecutive numbers from zero to VF.
2071   for (int i = 0; i < VLen; ++i)
2072     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2073 
2074   // Add the consecutive indices to the vector value.
2075   Constant *Cv = ConstantVector::get(Indices);
2076 
2077   Step = Builder.CreateVectorSplat(VLen, Step);
2078 
2079   // Floating point operations had to be 'fast' to enable the induction.
2080   FastMathFlags Flags;
2081   Flags.setFast();
2082 
2083   Value *MulOp = Builder.CreateFMul(Cv, Step);
2084   if (isa<Instruction>(MulOp))
    // Have to check: MulOp may be a constant.
2086     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2087 
2088   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2089   if (isa<Instruction>(BOp))
2090     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2091   return BOp;
2092 }
2093 
2094 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2095                                            Instruction *EntryVal,
2096                                            const InductionDescriptor &ID) {
2097   // We shouldn't have to build scalar steps if we aren't vectorizing.
2098   assert(VF.isVector() && "VF should be greater than one");
2099   assert(!VF.isScalable() &&
2100          "the code below assumes a fixed number of elements at compile time");
2101   // Get the value type and ensure it and the step have the same integer type.
2102   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2103   assert(ScalarIVTy == Step->getType() &&
2104          "Val and Step should have the same type");
2105 
2106   // We build scalar steps for both integer and floating-point induction
2107   // variables. Here, we determine the kind of arithmetic we will perform.
2108   Instruction::BinaryOps AddOp;
2109   Instruction::BinaryOps MulOp;
2110   if (ScalarIVTy->isIntegerTy()) {
2111     AddOp = Instruction::Add;
2112     MulOp = Instruction::Mul;
2113   } else {
2114     AddOp = ID.getInductionOpcode();
2115     MulOp = Instruction::FMul;
2116   }
2117 
2118   // Determine the number of scalars we need to generate for each unroll
2119   // iteration. If EntryVal is uniform, we only need to generate the first
2120   // lane. Otherwise, we generate all VF values.
2121   unsigned Lanes =
2122       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2123           ? 1
2124           : VF.getKnownMinValue();
2125   // Compute the scalar steps and save the results in VectorLoopValueMap.
2126   for (unsigned Part = 0; Part < UF; ++Part) {
2127     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2128       auto *StartIdx = getSignedIntOrFpConstant(
2129           ScalarIVTy, VF.getKnownMinValue() * Part + Lane);
2130       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2131       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2132       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2133       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2134     }
2135   }
2136 }
2137 
2138 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2139   assert(V != Induction && "The new induction variable should not be used.");
2140   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2141   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2142 
2143   // If we have a stride that is replaced by one, do it here. Defer this for
2144   // the VPlan-native path until we start running Legal checks in that path.
2145   if (!EnableVPlanNativePath && Legal->hasStride(V))
2146     V = ConstantInt::get(V->getType(), 1);
2147 
2148   // If we have a vector mapped to this value, return it.
2149   if (VectorLoopValueMap.hasVectorValue(V, Part))
2150     return VectorLoopValueMap.getVectorValue(V, Part);
2151 
2152   // If the value has not been vectorized, check if it has been scalarized
2153   // instead. If it has been scalarized, and we actually need the value in
2154   // vector form, we will construct the vector values on demand.
2155   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2156     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2157 
2158     // If we've scalarized a value, that value should be an instruction.
2159     auto *I = cast<Instruction>(V);
2160 
2161     // If we aren't vectorizing, we can just copy the scalar map values over to
2162     // the vector map.
2163     if (VF == 1) {
2164       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2165       return ScalarValue;
2166     }
2167 
2168     // Get the last scalar instruction we generated for V and Part. If the value
2169     // is known to be uniform after vectorization, this corresponds to lane zero
2170     // of the Part unroll iteration. Otherwise, the last instruction is the one
2171     // we created for the last vector lane of the Part unroll iteration.
2172     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2173     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2174                             ? 0
2175                             : VF.getKnownMinValue() - 1;
2176     auto *LastInst = cast<Instruction>(
2177         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2178 
2179     // Set the insert point after the last scalarized instruction. This ensures
2180     // the insertelement sequence will directly follow the scalar definitions.
2181     auto OldIP = Builder.saveIP();
2182     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2183     Builder.SetInsertPoint(&*NewIP);
2184 
2185     // However, if we are vectorizing, we need to construct the vector values.
2186     // If the value is known to be uniform after vectorization, we can just
2187     // broadcast the scalar value corresponding to lane zero for each unroll
2188     // iteration. Otherwise, we construct the vector values using insertelement
2189     // instructions. Since the resulting vectors are stored in
2190     // VectorLoopValueMap, we will only generate the insertelements once.
2191     Value *VectorValue = nullptr;
2192     if (Cost->isUniformAfterVectorization(I, VF)) {
2193       VectorValue = getBroadcastInstrs(ScalarValue);
2194       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2195     } else {
2196       // Initialize packing with insertelements to start from undef.
2197       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2198       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2199       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2200       for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2201         packScalarIntoVectorValue(V, {Part, Lane});
2202       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2203     }
2204     Builder.restoreIP(OldIP);
2205     return VectorValue;
2206   }
2207 
2208   // If this scalar is unknown, assume that it is a constant or that it is
2209   // loop invariant. Broadcast V and save the value for future uses.
2210   Value *B = getBroadcastInstrs(V);
2211   VectorLoopValueMap.setVectorValue(V, Part, B);
2212   return B;
2213 }
2214 
2215 Value *
2216 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2217                                             const VPIteration &Instance) {
2218   // If the value is not an instruction contained in the loop, it should
2219   // already be scalar.
2220   if (OrigLoop->isLoopInvariant(V))
2221     return V;
2222 
2223   assert(Instance.Lane > 0
2224              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2225              : true && "Uniform values only have lane zero");
2226 
2227   // If the value from the original loop has not been vectorized, it is
2228   // represented by UF x VF scalar values in the new loop. Return the requested
2229   // scalar value.
2230   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2231     return VectorLoopValueMap.getScalarValue(V, Instance);
2232 
2233   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2234   // for the given unroll part. If this entry is not a vector type (i.e., the
2235   // vectorization factor is one), there is no need to generate an
2236   // extractelement instruction.
2237   auto *U = getOrCreateVectorValue(V, Instance.Part);
2238   if (!U->getType()->isVectorTy()) {
2239     assert(VF == 1 && "Value not scalarized has non-vector type");
2240     return U;
2241   }
2242 
2243   // Otherwise, the value from the original loop has been vectorized and is
2244   // represented by UF vector values. Extract and return the requested scalar
2245   // value from the appropriate vector lane.
2246   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2247 }
2248 
2249 void InnerLoopVectorizer::packScalarIntoVectorValue(
2250     Value *V, const VPIteration &Instance) {
2251   assert(V != Induction && "The new induction variable should not be used.");
2252   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2253   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2254 
2255   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2256   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2257   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2258                                             Builder.getInt32(Instance.Lane));
2259   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2260 }
2261 
2262 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2263   assert(Vec->getType()->isVectorTy() && "Invalid type");
2264   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
2265   SmallVector<int, 8> ShuffleMask;
2266   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2267     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2268 
2269   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2270                                      ShuffleMask, "reverse");
2271 }
2272 
2273 // Return whether we allow using masked interleave-groups (for dealing with
2274 // strided loads/stores that reside in predicated blocks, or for dealing
2275 // with gaps).
2276 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2277   // If an override option has been passed in for interleaved accesses, use it.
2278   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2279     return EnableMaskedInterleavedMemAccesses;
2280 
2281   return TTI.enableMaskedInterleavedAccessVectorization();
2282 }
2283 
2284 // Try to vectorize the interleave group that \p Instr belongs to.
2285 //
// E.g., translate the following interleaved load group (factor = 3):
2287 //   for (i = 0; i < N; i+=3) {
2288 //     R = Pic[i];             // Member of index 0
2289 //     G = Pic[i+1];           // Member of index 1
2290 //     B = Pic[i+2];           // Member of index 2
2291 //     ... // do something to R, G, B
2292 //   }
2293 // To:
2294 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2295 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2296 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2297 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2298 //
// Or translate the following interleaved store group (factor = 3):
2300 //   for (i = 0; i < N; i+=3) {
2301 //     ... do something to R, G, B
2302 //     Pic[i]   = R;           // Member of index 0
2303 //     Pic[i+1] = G;           // Member of index 1
2304 //     Pic[i+2] = B;           // Member of index 2
2305 //   }
2306 // To:
2307 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2308 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2309 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2310 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2311 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2312 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2313     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2314     VPValue *Addr, VPValue *BlockInMask) {
2315   Instruction *Instr = Group->getInsertPos();
2316   const DataLayout &DL = Instr->getModule()->getDataLayout();
2317 
2318   // Prepare for the vector type of the interleaved load/store.
2319   Type *ScalarTy = getMemInstValueType(Instr);
2320   unsigned InterleaveFactor = Group->getFactor();
2321   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2322   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2323 
2324   // Prepare for the new pointers.
2325   SmallVector<Value *, 2> AddrParts;
2326   unsigned Index = Group->getIndex(Instr);
2327 
2328   // TODO: extend the masked interleaved-group support to reversed access.
2329   assert((!BlockInMask || !Group->isReverse()) &&
2330          "Reversed masked interleave-group not supported.");
2331 
2332   // If the group is reverse, adjust the index to refer to the last vector lane
2333   // instead of the first. We adjust the index from the first vector lane,
2334   // rather than directly getting the pointer for lane VF - 1, because the
2335   // pointer operand of the interleaved access is supposed to be uniform. For
2336   // uniform instructions, we're only required to generate a value for the
2337   // first vector lane in each unroll iteration.
2338   assert(!VF.isScalable() &&
2339          "scalable vector reverse operation is not implemented");
2340   if (Group->isReverse())
2341     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2342 
2343   for (unsigned Part = 0; Part < UF; Part++) {
2344     Value *AddrPart = State.get(Addr, {Part, 0});
2345     setDebugLocFromInst(Builder, AddrPart);
2346 
    // Note that the current instruction could be a member of any index. We
    // need to adjust the address to the member of index 0.
2349     //
2350     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2351     //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
2353     //
2354     // E.g.  A[i+1] = a;     // Member of index 1
2355     //       A[i]   = b;     // Member of index 0
2356     //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2358 
2359     bool InBounds = false;
2360     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2361       InBounds = gep->isInBounds();
2362     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2363     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2364 
2365     // Cast to the vector pointer type.
2366     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2367     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2368     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2369   }
2370 
2371   setDebugLocFromInst(Builder, Instr);
2372   Value *UndefVec = UndefValue::get(VecTy);
2373 
2374   Value *MaskForGaps = nullptr;
2375   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2376     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2377     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2378     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2379   }
2380 
2381   // Vectorize the interleaved load group.
2382   if (isa<LoadInst>(Instr)) {
2383     // For each unroll part, create a wide load for the group.
2384     SmallVector<Value *, 2> NewLoads;
2385     for (unsigned Part = 0; Part < UF; Part++) {
2386       Instruction *NewLoad;
2387       if (BlockInMask || MaskForGaps) {
2388         assert(useMaskedInterleavedAccesses(*TTI) &&
2389                "masked interleaved groups are not allowed.");
2390         Value *GroupMask = MaskForGaps;
2391         if (BlockInMask) {
2392           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2393           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2394           assert(!VF.isScalable() && "scalable vectors not yet supported.");
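          // Replicate each lane of the block mask across the members of the
          // group; e.g., for IF == 3 and VF == 4 the replicated shuffle mask
          // is <0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3>.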
2395           Value *ShuffledMask = Builder.CreateShuffleVector(
2396               BlockInMaskPart, Undefs,
2397               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2398               "interleaved.mask");
2399           GroupMask = MaskForGaps
2400                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2401                                                 MaskForGaps)
2402                           : ShuffledMask;
2403         }
2404         NewLoad =
2405             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2406                                      GroupMask, UndefVec, "wide.masked.vec");
      } else
2409         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2410                                             Group->getAlign(), "wide.vec");
2411       Group->addMetadata(NewLoad);
2412       NewLoads.push_back(NewLoad);
2413     }
2414 
2415     // For each member in the group, shuffle out the appropriate data from the
2416     // wide loads.
2417     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2418       Instruction *Member = Group->getMember(I);
2419 
2420       // Skip the gaps in the group.
2421       if (!Member)
2422         continue;
2423 
2424       assert(!VF.isScalable() && "scalable vectors not yet supported.");
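      // E.g., for member I == 1, IF == 2 and VF == 4 the stride mask is
      // <1, 3, 5, 7>, selecting this member's elements from the wide loads.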
2425       auto StrideMask =
2426           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2427       for (unsigned Part = 0; Part < UF; Part++) {
2428         Value *StridedVec = Builder.CreateShuffleVector(
2429             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2430 
        // If this member has a different type, cast the result to that type.
2432         if (Member->getType() != ScalarTy) {
2433           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2434           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2435           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2436         }
2437 
2438         if (Group->isReverse())
2439           StridedVec = reverseVector(StridedVec);
2440 
2441         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2442       }
2443     }
2444     return;
2445   }
2446 
  // The subvector type for the current instruction.
2448   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2449   auto *SubVT = VectorType::get(ScalarTy, VF);
2450 
2451   // Vectorize the interleaved store group.
2452   for (unsigned Part = 0; Part < UF; Part++) {
2453     // Collect the stored vector from each member.
2454     SmallVector<Value *, 4> StoredVecs;
2455     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a member.
2457       Instruction *Member = Group->getMember(i);
      assert(Member && "Failed to get a member from an interleaved store group");
2459 
2460       Value *StoredVec = getOrCreateVectorValue(
2461           cast<StoreInst>(Member)->getValueOperand(), Part);
2462       if (Group->isReverse())
2463         StoredVec = reverseVector(StoredVec);
2464 
      // If this member has a different type, cast it to the unified type.
2467       if (StoredVec->getType() != SubVT)
2468         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2469 
2470       StoredVecs.push_back(StoredVec);
2471     }
2472 
2473     // Concatenate all vectors into a wide vector.
2474     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2475 
2476     // Interleave the elements in the wide vector.
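    // E.g., for VF == 4 and IF == 2 the interleave mask is
    // <0, 4, 1, 5, 2, 6, 3, 7>, i.e. the concatenation {A, B} becomes
    // {A[0], B[0], A[1], B[1], A[2], B[2], A[3], B[3]}.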
2477     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2478     Value *IVec = Builder.CreateShuffleVector(
2479         WideVec, UndefVec,
2480         createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2481         "interleaved.vec");
2482 
2483     Instruction *NewStoreInstr;
2484     if (BlockInMask) {
2485       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2486       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2487       Value *ShuffledMask = Builder.CreateShuffleVector(
2488           BlockInMaskPart, Undefs,
2489           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2490           "interleaved.mask");
2491       NewStoreInstr = Builder.CreateMaskedStore(
2492           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    } else
2495       NewStoreInstr =
2496           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2497 
2498     Group->addMetadata(NewStoreInstr);
2499   }
2500 }
2501 
2502 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2503                                                      VPTransformState &State,
2504                                                      VPValue *Addr,
2505                                                      VPValue *StoredValue,
2506                                                      VPValue *BlockInMask) {
2507   // Attempt to issue a wide load.
2508   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2509   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2510 
2511   assert((LI || SI) && "Invalid Load/Store instruction");
2512   assert((!SI || StoredValue) && "No stored value provided for widened store");
2513   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2514 
2515   LoopVectorizationCostModel::InstWidening Decision =
2516       Cost->getWideningDecision(Instr, VF);
2517   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2518           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2519           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2520          "CM decision is not to widen the memory instruction");
2521 
2522   Type *ScalarDataTy = getMemInstValueType(Instr);
2523 
2524   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2525   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2526   const Align Alignment = getLoadStoreAlignment(Instr);
2527 
2528   // Determine if the pointer operand of the access is either consecutive or
2529   // reverse consecutive.
2530   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2531   bool ConsecutiveStride =
2532       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2533   bool CreateGatherScatter =
2534       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2535 
2536   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2537   // gather/scatter. Otherwise Decision should have been to Scalarize.
2538   assert((ConsecutiveStride || CreateGatherScatter) &&
2539          "The instruction should be scalarized");
2540   (void)ConsecutiveStride;
2541 
2542   VectorParts BlockInMaskParts(UF);
2543   bool isMaskRequired = BlockInMask;
2544   if (isMaskRequired)
2545     for (unsigned Part = 0; Part < UF; ++Part)
2546       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2547 
2548   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2549     // Calculate the pointer for the specific unroll-part.
2550     GetElementPtrInst *PartPtr = nullptr;
2551 
2552     bool InBounds = false;
2553     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2554       InBounds = gep->isInBounds();
2555 
2556     if (Reverse) {
      // If the address is consecutive but reversed, then the wide access
      // needs to start at the last vector element.
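      // E.g., for VF == 4 and Part == 1, the pointer is offset by -4 and then
      // by -3, so the wide access covers elements [-7, -4]; the loaded or
      // stored vector value is reversed separately.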
2559       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2560           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2561       PartPtr->setIsInBounds(InBounds);
2562       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2563           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2564       PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reversing a null all-ones mask is a no-op.
2566         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2567     } else {
2568       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2569           ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue())));
2570       PartPtr->setIsInBounds(InBounds);
2571     }
2572 
2573     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2574     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2575   };
2576 
2577   // Handle Stores:
2578   if (SI) {
2579     setDebugLocFromInst(Builder, SI);
2580 
2581     for (unsigned Part = 0; Part < UF; ++Part) {
2582       Instruction *NewSI = nullptr;
2583       Value *StoredVal = State.get(StoredValue, Part);
2584       if (CreateGatherScatter) {
2585         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2586         Value *VectorGep = State.get(Addr, Part);
2587         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2588                                             MaskPart);
2589       } else {
2590         if (Reverse) {
2591           // If we store to reverse consecutive memory locations, then we need
2592           // to reverse the order of elements in the stored value.
2593           StoredVal = reverseVector(StoredVal);
2594           // We don't want to update the value in the map as it might be used in
2595           // another expression. So don't call resetVectorValue(StoredVal).
2596         }
2597         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2598         if (isMaskRequired)
2599           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2600                                             BlockInMaskParts[Part]);
2601         else
2602           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2603       }
2604       addMetadata(NewSI, SI);
2605     }
2606     return;
2607   }
2608 
2609   // Handle loads.
2610   assert(LI && "Must have a load instruction");
2611   setDebugLocFromInst(Builder, LI);
2612   for (unsigned Part = 0; Part < UF; ++Part) {
2613     Value *NewLI;
2614     if (CreateGatherScatter) {
2615       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2616       Value *VectorGep = State.get(Addr, Part);
2617       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2618                                          nullptr, "wide.masked.gather");
2619       addMetadata(NewLI, LI);
2620     } else {
2621       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2622       if (isMaskRequired)
2623         NewLI = Builder.CreateMaskedLoad(
2624             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2625             "wide.masked.load");
2626       else
2627         NewLI =
2628             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2629 
      // Add metadata to the load itself, but set the vector value to the
      // reversed shuffle.
2631       addMetadata(NewLI, LI);
2632       if (Reverse)
2633         NewLI = reverseVector(NewLI);
2634     }
2635     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2636   }
2637 }
2638 
2639 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2640                                                const VPIteration &Instance,
2641                                                bool IfPredicateInstr,
2642                                                VPTransformState &State) {
2643   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2644 
2645   setDebugLocFromInst(Builder, Instr);
2646 
  // Does this instruction return a value?
2648   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2649 
2650   Instruction *Cloned = Instr->clone();
2651   if (!IsVoidRetTy)
2652     Cloned->setName(Instr->getName() + ".cloned");
2653 
2654   // Replace the operands of the cloned instructions with their scalar
2655   // equivalents in the new loop.
2656   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2657     auto *NewOp = State.get(User.getOperand(op), Instance);
2658     Cloned->setOperand(op, NewOp);
2659   }
2660   addNewMetadata(Cloned, Instr);
2661 
2662   // Place the cloned scalar in the new loop.
2663   Builder.Insert(Cloned);
2664 
2665   // Add the cloned scalar to the scalar map entry.
2666   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2667 
  // If we just cloned a new assumption, add it to the assumption cache.
2669   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2670     if (II->getIntrinsicID() == Intrinsic::assume)
2671       AC->registerAssumption(II);
2672 
2673   // End if-block.
2674   if (IfPredicateInstr)
2675     PredicatedInstructions.push_back(Cloned);
2676 }
2677 
2678 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2679                                                       Value *End, Value *Step,
2680                                                       Instruction *DL) {
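  // Schematically (block names are illustrative), the induction built below
  // looks like:
  //   %index = phi [ %Start, <preheader> ], [ %index.next, <latch> ]
  //   ...
  //   %index.next = add %index, %Step
  //   %cmp = icmp eq %index.next, %End
  //   br i1 %cmp, label <exit block>, label <header>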
2681   BasicBlock *Header = L->getHeader();
2682   BasicBlock *Latch = L->getLoopLatch();
2683   // As we're just creating this loop, it's possible no latch exists
2684   // yet. If so, use the header as this will be a single block loop.
2685   if (!Latch)
2686     Latch = Header;
2687 
2688   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2689   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2690   setDebugLocFromInst(Builder, OldInst);
2691   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2692 
2693   Builder.SetInsertPoint(Latch->getTerminator());
2694   setDebugLocFromInst(Builder, OldInst);
2695 
2696   // Create i+1 and fill the PHINode.
2697   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2698   Induction->addIncoming(Start, L->getLoopPreheader());
2699   Induction->addIncoming(Next, Latch);
2700   // Create the compare.
2701   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2702   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2703 
2704   // Now we have two terminators. Remove the old one from the block.
2705   Latch->getTerminator()->eraseFromParent();
2706 
2707   return Induction;
2708 }
2709 
2710 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2711   if (TripCount)
2712     return TripCount;
2713 
2714   assert(L && "Create Trip Count for null loop.");
2715   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2716   // Find the loop boundaries.
2717   ScalarEvolution *SE = PSE.getSE();
2718   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2719   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2720          "Invalid loop count");
2721 
2722   Type *IdxTy = Legal->getWidestInductionType();
2723   assert(IdxTy && "No type for induction");
2724 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign extended before the compare.
  // The only way we can get a backedge-taken count then is if the induction
  // variable was signed and, as such, will not overflow, so truncation is
  // legal.
2730   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2731       IdxTy->getPrimitiveSizeInBits())
2732     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2733   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2734 
2735   // Get the total trip count from the count by adding 1.
2736   const SCEV *ExitCount = SE->getAddExpr(
2737       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2738 
2739   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2740 
2741   // Expand the trip count and place the new instructions in the preheader.
2742   // Notice that the pre-header does not change, only the loop body.
2743   SCEVExpander Exp(*SE, DL, "induction");
2744 
2745   // Count holds the overall loop count (N).
2746   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2747                                 L->getLoopPreheader()->getTerminator());
2748 
2749   if (TripCount->getType()->isPointerTy())
2750     TripCount =
2751         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2752                                     L->getLoopPreheader()->getTerminator());
2753 
2754   return TripCount;
2755 }
2756 
2757 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2758   if (VectorTripCount)
2759     return VectorTripCount;
2760 
2761   Value *TC = getOrCreateTripCount(L);
2762   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2763 
2764   Type *Ty = TC->getType();
2765   // This is where we can make the step a runtime constant.
2766   assert(!VF.isScalable() && "scalable vectorization is not supported yet");
2767   Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
2768 
2769   // If the tail is to be folded by masking, round the number of iterations N
2770   // up to a multiple of Step instead of rounding down. This is done by first
2771   // adding Step-1 and then rounding down. Note that it's ok if this addition
2772   // overflows: the vector induction variable will eventually wrap to zero given
2773   // that it starts at zero and its Step is a power of two; the loop will then
2774   // exit, with the last early-exit vector comparison also producing all-true.
2775   if (Cost->foldTailByMasking()) {
2776     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2777            "VF*UF must be a power of 2 when folding tail by masking");
2778     TC = Builder.CreateAdd(
2779         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
2780   }
2781 
2782   // Now we need to generate the expression for the part of the loop that the
2783   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2784   // iterations are not required for correctness, or N - Step, otherwise. Step
2785   // is equal to the vectorization factor (number of SIMD elements) times the
2786   // unroll factor (number of SIMD instructions).
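  // E.g., with VF == 4, UF == 2 (Step == 8) and a trip count of 19: without
  // tail folding the remainder is 3 and the vector trip count is 16; with
  // tail folding the count is first rounded up to 26, so the vector trip
  // count becomes 24 and the vector loop handles all iterations under a mask.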
2787   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2788 
2789   // If there is a non-reversed interleaved group that may speculatively access
2790   // memory out-of-bounds, we need to ensure that there will be at least one
2791   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2792   // the trip count, we set the remainder to be equal to the step. If the step
2793   // does not evenly divide the trip count, no adjustment is necessary since
2794   // there will already be scalar iterations. Note that the minimum iterations
2795   // check ensures that N >= Step.
2796   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
2797     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2798     R = Builder.CreateSelect(IsZero, Step, R);
2799   }
2800 
2801   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2802 
2803   return VectorTripCount;
2804 }
2805 
2806 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2807                                                    const DataLayout &DL) {
2808   // Verify that V is a vector type with same number of elements as DstVTy.
2809   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2810   unsigned VF = DstFVTy->getNumElements();
2811   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2813   Type *SrcElemTy = SrcVecTy->getElementType();
2814   Type *DstElemTy = DstFVTy->getElementType();
2815   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2816          "Vector elements must have same size");
2817 
2818   // Do a direct cast if element types are castable.
2819   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2820     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2821   }
2822   // V cannot be directly casted to desired vector type.
2823   // May happen when V is a floating point vector but DstVTy is a vector of
2824   // pointers or vice-versa. Handle this using a two-step bitcast using an
2825   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
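  // E.g., on a target with 64-bit pointers, <4 x double> is first bitcast to
  // <4 x i64> and then cast to <4 x i8*>.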
2826   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2827          "Only one type should be a pointer type");
2828   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2829          "Only one type should be a floating point type");
2830   Type *IntTy =
2831       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2832   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2833   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2834   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2835 }
2836 
2837 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2838                                                          BasicBlock *Bypass) {
2839   Value *Count = getOrCreateTripCount(L);
  // Reuse the existing vector loop preheader for the TC checks.
  // Note that a new preheader block is generated for the vector loop.
2842   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2843   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2844 
2845   // Generate code to check if the loop's trip count is less than VF * UF, or
2846   // equal to it in case a scalar epilogue is required; this implies that the
2847   // vector trip count is zero. This check also covers the case where adding one
2848   // to the backedge-taken count overflowed leading to an incorrect trip count
2849   // of zero. In this case we will also jump to the scalar loop.
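  // E.g., with VF == 4 and UF == 2 we branch to the scalar loop when the
  // trip count is less than 8 (or less than or equal to 8 when a scalar
  // epilogue is required).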
2850   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2851                                           : ICmpInst::ICMP_ULT;
2852 
2853   // If tail is to be folded, vector loop takes care of all iterations.
2854   Value *CheckMinIters = Builder.getFalse();
2855   if (!Cost->foldTailByMasking()) {
2856     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2857     CheckMinIters = Builder.CreateICmp(
2858         P, Count,
2859         ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF),
2860         "min.iters.check");
2861   }
2862   // Create new preheader for vector loop.
2863   LoopVectorPreHeader =
2864       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2865                  "vector.ph");
2866 
2867   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2868                                DT->getNode(Bypass)->getIDom()) &&
2869          "TC check is expected to dominate Bypass");
2870 
2871   // Update dominator for Bypass & LoopExit.
2872   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2873   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2874 
2875   ReplaceInstWithInst(
2876       TCCheckBlock->getTerminator(),
2877       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2878   LoopBypassBlocks.push_back(TCCheckBlock);
2879 }
2880 
2881 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  // Reuse the existing vector loop preheader for the SCEV checks.
  // Note that a new preheader block is generated for the vector loop.
2884   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2885 
  // Generate the code that checks the SCEV assumptions we made.
2887   // We want the new basic block to start at the first instruction in a
2888   // sequence of instructions that form a check.
2889   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2890                    "scev.check");
2891   Value *SCEVCheck = Exp.expandCodeForPredicate(
2892       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2893 
2894   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2895     if (C->isZero())
2896       return;
2897 
2898   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2899            (OptForSizeBasedOnProfile &&
2900             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2901          "Cannot SCEV check stride or overflow when optimizing for size");
2902 
2903   SCEVCheckBlock->setName("vector.scevcheck");
2904   // Create new preheader for vector loop.
2905   LoopVectorPreHeader =
2906       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2907                  nullptr, "vector.ph");
2908 
  // Update dominator only if this is the first RT check.
2910   if (LoopBypassBlocks.empty()) {
2911     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2912     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2913   }
2914 
2915   ReplaceInstWithInst(
2916       SCEVCheckBlock->getTerminator(),
2917       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2918   LoopBypassBlocks.push_back(SCEVCheckBlock);
2919   AddedSafetyChecks = true;
2920 }
2921 
2922 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2923   // VPlan-native path does not do any analysis for runtime checks currently.
2924   if (EnableVPlanNativePath)
2925     return;
2926 
  // Reuse the existing vector loop preheader for the runtime memory checks.
  // Note that a new preheader block is generated for the vector loop.
2929   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2930 
2931   // Generate the code that checks in runtime if arrays overlap. We put the
2932   // checks into a separate block to make the more common case of few elements
2933   // faster.
2934   auto *LAI = Legal->getLAI();
2935   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
2936   if (!RtPtrChecking.Need)
2937     return;
2938   Instruction *FirstCheckInst;
2939   Instruction *MemRuntimeCheck;
2940   std::tie(FirstCheckInst, MemRuntimeCheck) =
2941       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
2942                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
2943   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
2944                             "claimed checks are required");
2945 
2946   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2947     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2948            "Cannot emit memory checks when optimizing for size, unless forced "
2949            "to vectorize.");
2950     ORE->emit([&]() {
2951       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2952                                         L->getStartLoc(), L->getHeader())
2953              << "Code-size may be reduced by not forcing "
2954                 "vectorization, or by source-code modifications "
2955                 "eliminating the need for runtime checks "
2956                 "(e.g., adding 'restrict').";
2957     });
2958   }
2959 
2960   MemCheckBlock->setName("vector.memcheck");
2961   // Create new preheader for vector loop.
2962   LoopVectorPreHeader =
2963       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2964                  "vector.ph");
2965 
  // Update dominator only if this is the first RT check.
2967   if (LoopBypassBlocks.empty()) {
2968     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2969     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2970   }
2971 
2972   ReplaceInstWithInst(
2973       MemCheckBlock->getTerminator(),
2974       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2975   LoopBypassBlocks.push_back(MemCheckBlock);
2976   AddedSafetyChecks = true;
2977 
2978   // We currently don't use LoopVersioning for the actual loop cloning but we
2979   // still use it to add the noalias metadata.
2980   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2981                                           PSE.getSE());
2982   LVer->prepareNoAliasMetadata();
2983 }
2984 
2985 Value *InnerLoopVectorizer::emitTransformedIndex(
2986     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2987     const InductionDescriptor &ID) const {
2988 
2989   SCEVExpander Exp(*SE, DL, "induction");
2990   auto Step = ID.getStep();
2991   auto StartValue = ID.getStartValue();
2992   assert(Index->getType() == Step->getType() &&
2993          "Index type does not match StepValue type");
2994 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
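  //
  // In all cases below the transformed index is effectively
  // StartValue op (Index * Step): an integer add (or sub) for integer
  // inductions, a GEP for pointer inductions, and the original FAdd/FSub for
  // FP inductions.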
3001   auto CreateAdd = [&B](Value *X, Value *Y) {
3002     assert(X->getType() == Y->getType() && "Types don't match!");
3003     if (auto *CX = dyn_cast<ConstantInt>(X))
3004       if (CX->isZero())
3005         return Y;
3006     if (auto *CY = dyn_cast<ConstantInt>(Y))
3007       if (CY->isZero())
3008         return X;
3009     return B.CreateAdd(X, Y);
3010   };
3011 
3012   auto CreateMul = [&B](Value *X, Value *Y) {
3013     assert(X->getType() == Y->getType() && "Types don't match!");
3014     if (auto *CX = dyn_cast<ConstantInt>(X))
3015       if (CX->isOne())
3016         return Y;
3017     if (auto *CY = dyn_cast<ConstantInt>(Y))
3018       if (CY->isOne())
3019         return X;
3020     return B.CreateMul(X, Y);
3021   };
3022 
3023   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3024   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3025   // the DomTree is not kept up-to-date for additional blocks generated in the
3026   // vector loop. By using the header as insertion point, we guarantee that the
3027   // expanded instructions dominate all their uses.
3028   auto GetInsertPoint = [this, &B]() {
3029     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3030     if (InsertBB != LoopVectorBody &&
3031         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3032       return LoopVectorBody->getTerminator();
3033     return &*B.GetInsertPoint();
3034   };
3035   switch (ID.getKind()) {
3036   case InductionDescriptor::IK_IntInduction: {
3037     assert(Index->getType() == StartValue->getType() &&
3038            "Index type does not match StartValue type");
3039     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3040       return B.CreateSub(StartValue, Index);
3041     auto *Offset = CreateMul(
3042         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3043     return CreateAdd(StartValue, Offset);
3044   }
3045   case InductionDescriptor::IK_PtrInduction: {
3046     assert(isa<SCEVConstant>(Step) &&
3047            "Expected constant step for pointer induction");
3048     return B.CreateGEP(
3049         StartValue->getType()->getPointerElementType(), StartValue,
3050         CreateMul(Index,
3051                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3052   }
3053   case InductionDescriptor::IK_FpInduction: {
3054     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3055     auto InductionBinOp = ID.getInductionBinOp();
3056     assert(InductionBinOp &&
3057            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3058             InductionBinOp->getOpcode() == Instruction::FSub) &&
3059            "Original bin op should be defined for FP induction");
3060 
3061     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3062 
3063     // Floating point operations had to be 'fast' to enable the induction.
3064     FastMathFlags Flags;
3065     Flags.setFast();
3066 
3067     Value *MulExp = B.CreateFMul(StepValue, Index);
3068     if (isa<Instruction>(MulExp))
      // We have to check here because MulExp may be a constant.
3070       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3071 
3072     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3073                                "induction");
3074     if (isa<Instruction>(BOp))
3075       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3076 
3077     return BOp;
3078   }
3079   case InductionDescriptor::IK_NoInduction:
3080     return nullptr;
3081   }
3082   llvm_unreachable("invalid enum");
3083 }
3084 
3085 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3086   LoopScalarBody = OrigLoop->getHeader();
3087   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3088   LoopExitBlock = OrigLoop->getExitBlock();
3089   assert(LoopExitBlock && "Must have an exit block");
3090   assert(LoopVectorPreHeader && "Invalid loop structure");
3091 
3092   LoopMiddleBlock =
3093       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3094                  LI, nullptr, Twine(Prefix) + "middle.block");
3095   LoopScalarPreHeader =
3096       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3097                  nullptr, Twine(Prefix) + "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
3101   LoopVectorBody =
3102       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3103                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3104 
3105   // Update dominator for loop exit.
3106   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3107 
3108   // Create and register the new vector loop.
3109   Loop *Lp = LI->AllocateLoop();
3110   Loop *ParentLoop = OrigLoop->getParentLoop();
3111 
3112   // Insert the new loop into the loop nest and register the new basic blocks
3113   // before calling any utilities such as SCEV that require valid LoopInfo.
3114   if (ParentLoop) {
3115     ParentLoop->addChildLoop(Lp);
3116   } else {
3117     LI->addTopLevelLoop(Lp);
3118   }
3119   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3120   return Lp;
3121 }
3122 
3123 void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
3124                                                       Value *VectorTripCount) {
3125   assert(VectorTripCount && L && "Expected valid arguments");
3126   // We are going to resume the execution of the scalar loop.
3127   // Go over all of the induction variables that we found and fix the
3128   // PHIs that are left in the scalar version of the loop.
3129   // The starting values of PHI nodes depend on the counter of the last
3130   // iteration in the vectorized loop.
3131   // If we come from a bypass edge then we need to start from the original
3132   // start value.
3133   for (auto &InductionEntry : Legal->getInductionVars()) {
3134     PHINode *OrigPhi = InductionEntry.first;
3135     InductionDescriptor II = InductionEntry.second;
3136 
    // Create phi nodes to merge from the backedge-taken check block.
3138     PHINode *BCResumeVal =
3139         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3140                         LoopScalarPreHeader->getTerminator());
3141     // Copy original phi DL over to the new one.
3142     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3143     Value *&EndValue = IVEndValues[OrigPhi];
3144     if (OrigPhi == OldInduction) {
3145       // We know what the end value is.
3146       EndValue = VectorTripCount;
3147     } else {
3148       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3149       Type *StepType = II.getStep()->getType();
3150       Instruction::CastOps CastOp =
3151           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3152       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3153       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3154       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3155       EndValue->setName("ind.end");
3156     }
3157 
3158     // The new PHI merges the original incoming value, in case of a bypass,
3159     // or the value at the end of the vectorized loop.
3160     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3161 
3162     // Fix the scalar body counter (PHI node).
3163     // The old induction's phi node in the scalar body needs the truncated
3164     // value.
3165     for (BasicBlock *BB : LoopBypassBlocks)
3166       BCResumeVal->addIncoming(II.getStartValue(), BB);
3167     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3168   }
3169 }
3170 
3171 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3172                                                       MDNode *OrigLoopID) {
3173   assert(L && "Expected valid loop.");
3174 
3175   // The trip counts should be cached by now.
3176   Value *Count = getOrCreateTripCount(L);
3177   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3178 
3179   // We need the OrigLoop (scalar loop part) latch terminator to help
3180   // produce correct debug info for the middle block BB instructions.
3181   // The legality check stage guarantees that the loop will have a single
3182   // latch.
3183   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3184          "Scalar loop latch terminator isn't a branch");
3185   BranchInst *ScalarLatchBr =
3186       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3187 
3188   // Add a check in the middle block to see if we have completed
3189   // all of the iterations in the first vector loop.
3190   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3191   // If tail is to be folded, we know we don't need to run the remainder.
3192   Value *CmpN = Builder.getTrue();
3193   if (!Cost->foldTailByMasking()) {
3194     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3195                            VectorTripCount, "cmp.n",
3196                            LoopMiddleBlock->getTerminator());
3197 
3198     // Here we use the same DebugLoc as the scalar loop latch branch instead
3199     // of the corresponding compare because they may have ended up with
3200     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3202     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3203   }
3204 
3205   BranchInst *BrInst =
3206       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3207   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3208   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3209 
3210   // Get ready to start creating new instructions into the vectorized body.
3211   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3212          "Inconsistent vector loop preheader");
3213   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3214 
3215   Optional<MDNode *> VectorizedLoopID =
3216       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3217                                       LLVMLoopVectorizeFollowupVectorized});
3218   if (VectorizedLoopID.hasValue()) {
3219     L->setLoopID(VectorizedLoopID.getValue());
3220 
3221     // Do not setAlreadyVectorized if loop attributes have been defined
3222     // explicitly.
3223     return LoopVectorPreHeader;
3224   }
3225 
3226   // Keep all loop hints from the original loop on the vector loop (we'll
3227   // replace the vectorizer-specific hints below).
3228   if (MDNode *LID = OrigLoop->getLoopID())
3229     L->setLoopID(LID);
3230 
3231   LoopVectorizeHints Hints(L, true, *ORE);
3232   Hints.setAlreadyVectorized();
3233 
3234 #ifdef EXPENSIVE_CHECKS
3235   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3236   LI->verify(*DT);
3237 #endif
3238 
3239   return LoopVectorPreHeader;
3240 }
3241 
3242 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3243   /*
3244    In this function we generate a new loop. The new loop will contain
3245    the vectorized instructions while the old loop will continue to run the
3246    scalar remainder.
3247 
3248        [ ] <-- loop iteration number check.
3249     /   |
3250    /    v
3251   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3252   |  /  |
3253   | /   v
3254   ||   [ ]     <-- vector pre header.
3255   |/    |
3256   |     v
3257   |    [  ] \
3258   |    [  ]_|   <-- vector loop.
3259   |     |
3260   |     v
3261   |   -[ ]   <--- middle-block.
3262   |  /  |
3263   | /   v
3264   -|- >[ ]     <--- new preheader.
3265    |    |
3266    |    v
3267    |   [ ] \
3268    |   [ ]_|   <-- old scalar loop to handle remainder.
3269     \   |
3270      \  v
3271       >[ ]     <-- exit block.
3272    ...
3273    */
3274 
3275   // Get the metadata of the original loop before it gets modified.
3276   MDNode *OrigLoopID = OrigLoop->getLoopID();
3277 
3278   // Create an empty vector loop, and prepare basic blocks for the runtime
3279   // checks.
3280   Loop *Lp = createVectorLoopSkeleton("");
3281 
3282   // Now, compare the new count to zero. If it is zero skip the vector loop and
3283   // jump to the scalar loop. This check also covers the case where the
3284   // backedge-taken count is uint##_max: adding one to it will overflow leading
3285   // to an incorrect trip count of zero. In this (rare) case we will also jump
3286   // to the scalar loop.
3287   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3288 
3289   // Generate the code to check any assumptions that we've made for SCEV
3290   // expressions.
3291   emitSCEVChecks(Lp, LoopScalarPreHeader);
3292 
3293   // Generate the code that checks in runtime if arrays overlap. We put the
3294   // checks into a separate block to make the more common case of few elements
3295   // faster.
3296   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3297 
3298   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
3300   // induction variables. In the code below we also support a case where we
3301   // don't have a single induction variable.
3302   //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
3305   //   - is an integer
3306   //   - counts from zero, stepping by one
3307   //   - is the size of the widest induction variable type
3308   // then we create a new one.
3309   OldInduction = Legal->getPrimaryInduction();
3310   Type *IdxTy = Legal->getWidestInductionType();
3311   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3312   // The loop step is equal to the vectorization factor (num of SIMD elements)
3313   // times the unroll factor (num of SIMD instructions).
3314   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3315   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
3316   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3317   Induction =
3318       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3319                               getDebugLocFromInstOrOperands(OldInduction));
3320 
3321   // Emit phis for the new starting index of the scalar loop.
3322   createInductionResumeValues(Lp, CountRoundDown);
3323 
3324   return completeLoopSkeleton(Lp, OrigLoopID);
3325 }
3326 
3327 // Fix up external users of the induction variable. At this point, we are
3328 // in LCSSA form, with all external PHIs that use the IV having one input value,
3329 // coming from the remainder loop. We need those PHIs to also have a correct
3330 // value for the IV when arriving directly from the middle block.
3331 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3332                                        const InductionDescriptor &II,
3333                                        Value *CountRoundDown, Value *EndValue,
3334                                        BasicBlock *MiddleBlock) {
3335   // There are two kinds of external IV usages - those that use the value
3336   // computed in the last iteration (the PHI) and those that use the penultimate
3337   // value (the value that feeds into the phi from the loop latch).
3338   // We allow both, but they, obviously, have different values.
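  //
  // E.g., an external user of the value coming from the latch (the
  // post-increment value) sees EndValue itself, while an external user of the
  // phi sees EndValue - Step, which is recomputed below as
  // Start + Step * (CRD - 1).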
3339 
3340   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3341 
3342   DenseMap<Value *, Value *> MissingVals;
3343 
3344   // An external user of the last iteration's value should see the value that
3345   // the remainder loop uses to initialize its own IV.
3346   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3347   for (User *U : PostInc->users()) {
3348     Instruction *UI = cast<Instruction>(U);
3349     if (!OrigLoop->contains(UI)) {
3350       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3351       MissingVals[UI] = EndValue;
3352     }
3353   }
3354 
  // An external user of the penultimate value needs to see EndValue - Step.
3356   // The simplest way to get this is to recompute it from the constituent SCEVs,
3357   // that is Start + (Step * (CRD - 1)).
3358   for (User *U : OrigPhi->users()) {
3359     auto *UI = cast<Instruction>(U);
3360     if (!OrigLoop->contains(UI)) {
3361       const DataLayout &DL =
3362           OrigLoop->getHeader()->getModule()->getDataLayout();
3363       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3364 
3365       IRBuilder<> B(MiddleBlock->getTerminator());
3366       Value *CountMinusOne = B.CreateSub(
3367           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3368       Value *CMO =
3369           !II.getStep()->getType()->isIntegerTy()
3370               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3371                              II.getStep()->getType())
3372               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3373       CMO->setName("cast.cmo");
3374       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3375       Escape->setName("ind.escape");
3376       MissingVals[UI] = Escape;
3377     }
3378   }
3379 
3380   for (auto &I : MissingVals) {
3381     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3383     // that is %IV2 = phi [...], [ %IV1, %latch ]
3384     // In this case, if IV1 has an external use, we need to avoid adding both
3385     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3386     // don't already have an incoming value for the middle block.
3387     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3388       PHI->addIncoming(I.second, MiddleBlock);
3389   }
3390 }
3391 
3392 namespace {
3393 
3394 struct CSEDenseMapInfo {
3395   static bool canHandle(const Instruction *I) {
3396     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3397            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3398   }
3399 
3400   static inline Instruction *getEmptyKey() {
3401     return DenseMapInfo<Instruction *>::getEmptyKey();
3402   }
3403 
3404   static inline Instruction *getTombstoneKey() {
3405     return DenseMapInfo<Instruction *>::getTombstoneKey();
3406   }
3407 
3408   static unsigned getHashValue(const Instruction *I) {
3409     assert(canHandle(I) && "Unknown instruction!");
3410     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3411                                                            I->value_op_end()));
3412   }
3413 
3414   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3415     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3416         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3417       return LHS == RHS;
3418     return LHS->isIdenticalTo(RHS);
3419   }
3420 };
3421 
3422 } // end anonymous namespace
3423 
/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3427   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3428   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3429     Instruction *In = &*I++;
3430 
3431     if (!CSEDenseMapInfo::canHandle(In))
3432       continue;
3433 
3434     // Check if we can replace this instruction with any of the
3435     // visited instructions.
3436     if (Instruction *V = CSEMap.lookup(In)) {
3437       In->replaceAllUsesWith(V);
3438       In->eraseFromParent();
3439       continue;
3440     }
3441 
3442     CSEMap[In] = In;
3443   }
3444 }
3445 
3446 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3447                                                        ElementCount VF,
3448                                                        bool &NeedToScalarize) {
3449   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3450   Function *F = CI->getCalledFunction();
3451   Type *ScalarRetTy = CI->getType();
3452   SmallVector<Type *, 4> Tys, ScalarTys;
3453   for (auto &ArgOp : CI->arg_operands())
3454     ScalarTys.push_back(ArgOp->getType());
3455 
3456   // Estimate cost of scalarized vector call. The source operands are assumed
3457   // to be vectors, so we need to extract individual elements from there,
3458   // execute VF scalar calls, and then gather the result into the vector return
3459   // value.
3460   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3461                                                  TTI::TCK_RecipThroughput);
3462   if (VF.isScalar())
3463     return ScalarCallCost;
3464 
3465   // Compute corresponding vector type for return value and arguments.
3466   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3467   for (Type *ScalarTy : ScalarTys)
3468     Tys.push_back(ToVectorTy(ScalarTy, VF));
3469 
3470   // Compute costs of unpacking argument values for the scalar calls and
3471   // packing the return values to a vector.
3472   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3473 
3474   unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3475 
3476   // If we can't emit a vector call for this function, then the currently found
3477   // cost is the cost we need to return.
3478   NeedToScalarize = true;
3479   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3480   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3481 
3482   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3483     return Cost;
3484 
3485   // If the corresponding vector cost is cheaper, return its cost.
3486   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3487                                                  TTI::TCK_RecipThroughput);
3488   if (VectorCallCost < Cost) {
3489     NeedToScalarize = false;
3490     return VectorCallCost;
3491   }
3492   return Cost;
3493 }
3494 
3495 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3496                                                             ElementCount VF) {
3497   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3498   assert(ID && "Expected intrinsic call!");
3499 
3500   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3501   return TTI.getIntrinsicInstrCost(CostAttrs,
3502                                    TargetTransformInfo::TCK_RecipThroughput);
3503 }
3504 
3505 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3506   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3507   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3508   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3509 }
3510 
3511 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3512   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3513   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3514   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3515 }
3516 
3517 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3518   // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and re-extend its result. InstCombine runs
3520   // later and will remove any ext/trunc pairs.
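  //
  // E.g., an i32 add whose result is known to need only 8 bits has both
  // operands truncated to <VF x i8>, the add is performed on <VF x i8>, and
  // the result is zero-extended back to <VF x i32>.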
3521   SmallPtrSet<Value *, 4> Erased;
3522   for (const auto &KV : Cost->getMinimalBitwidths()) {
3523     // If the value wasn't vectorized, we must maintain the original scalar
3524     // type. The absence of the value from VectorLoopValueMap indicates that it
3525     // wasn't vectorized.
3526     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3527       continue;
3528     for (unsigned Part = 0; Part < UF; ++Part) {
3529       Value *I = getOrCreateVectorValue(KV.first, Part);
3530       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3531         continue;
3532       Type *OriginalTy = I->getType();
3533       Type *ScalarTruncatedTy =
3534           IntegerType::get(OriginalTy->getContext(), KV.second);
3535       auto *TruncatedTy = FixedVectorType::get(
3536           ScalarTruncatedTy,
3537           cast<FixedVectorType>(OriginalTy)->getNumElements());
3538       if (TruncatedTy == OriginalTy)
3539         continue;
3540 
3541       IRBuilder<> B(cast<Instruction>(I));
3542       auto ShrinkOperand = [&](Value *V) -> Value * {
3543         if (auto *ZI = dyn_cast<ZExtInst>(V))
3544           if (ZI->getSrcTy() == TruncatedTy)
3545             return ZI->getOperand(0);
3546         return B.CreateZExtOrTrunc(V, TruncatedTy);
3547       };
3548 
3549       // The actual instruction modification depends on the instruction type,
3550       // unfortunately.
3551       Value *NewI = nullptr;
3552       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3553         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3554                              ShrinkOperand(BO->getOperand(1)));
3555 
3556         // Any wrapping introduced by shrinking this operation shouldn't be
3557         // considered undefined behavior. So, we can't unconditionally copy
3558         // arithmetic wrapping flags to NewI.
3559         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3560       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3561         NewI =
3562             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3563                          ShrinkOperand(CI->getOperand(1)));
3564       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3565         NewI = B.CreateSelect(SI->getCondition(),
3566                               ShrinkOperand(SI->getTrueValue()),
3567                               ShrinkOperand(SI->getFalseValue()));
3568       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3569         switch (CI->getOpcode()) {
3570         default:
3571           llvm_unreachable("Unhandled cast!");
3572         case Instruction::Trunc:
3573           NewI = ShrinkOperand(CI->getOperand(0));
3574           break;
3575         case Instruction::SExt:
3576           NewI = B.CreateSExtOrTrunc(
3577               CI->getOperand(0),
3578               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3579           break;
3580         case Instruction::ZExt:
3581           NewI = B.CreateZExtOrTrunc(
3582               CI->getOperand(0),
3583               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3584           break;
3585         }
3586       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3587         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3588                              ->getNumElements();
3589         auto *O0 = B.CreateZExtOrTrunc(
3590             SI->getOperand(0),
3591             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3592         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3593                              ->getNumElements();
3594         auto *O1 = B.CreateZExtOrTrunc(
3595             SI->getOperand(1),
3596             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3597 
3598         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3599       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3600         // Don't do anything with the operands, just extend the result.
3601         continue;
3602       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3603         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3604                             ->getNumElements();
3605         auto *O0 = B.CreateZExtOrTrunc(
3606             IE->getOperand(0),
3607             FixedVectorType::get(ScalarTruncatedTy, Elements));
3608         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3609         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3610       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3611         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3612                             ->getNumElements();
3613         auto *O0 = B.CreateZExtOrTrunc(
3614             EE->getOperand(0),
3615             FixedVectorType::get(ScalarTruncatedTy, Elements));
3616         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3617       } else {
3618         // If we don't know what to do, be conservative and don't do anything.
3619         continue;
3620       }
3621 
3622       // Lastly, extend the result.
3623       NewI->takeName(cast<Instruction>(I));
3624       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3625       I->replaceAllUsesWith(Res);
3626       cast<Instruction>(I)->eraseFromParent();
3627       Erased.insert(I);
3628       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3629     }
3630   }
3631 
  // We'll have created a bunch of ZExts that are now dead. Clean them up.
3633   for (const auto &KV : Cost->getMinimalBitwidths()) {
3634     // If the value wasn't vectorized, we must maintain the original scalar
3635     // type. The absence of the value from VectorLoopValueMap indicates that it
3636     // wasn't vectorized.
3637     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3638       continue;
3639     for (unsigned Part = 0; Part < UF; ++Part) {
3640       Value *I = getOrCreateVectorValue(KV.first, Part);
3641       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3642       if (Inst && Inst->use_empty()) {
3643         Value *NewI = Inst->getOperand(0);
3644         Inst->eraseFromParent();
3645         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3646       }
3647     }
3648   }
3649 }
3650 
3651 void InnerLoopVectorizer::fixVectorizedLoop() {
3652   // Insert truncates and extends for any truncated instructions as hints to
3653   // InstCombine.
3654   if (VF.isVector())
3655     truncateToMinimalBitwidths();
3656 
3657   // Fix widened non-induction PHIs by setting up the PHI operands.
3658   if (OrigPHIsToFix.size()) {
3659     assert(EnableVPlanNativePath &&
3660            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3661     fixNonInductionPHIs();
3662   }
3663 
3664   // At this point every instruction in the original loop is widened to a
3665   // vector form. Now we need to fix the recurrences in the loop. These PHI
3666   // nodes are currently empty because we did not want to introduce cycles.
3667   // This is the second stage of vectorizing recurrences.
3668   fixCrossIterationPHIs();
3669 
3670   // Forget the original basic block.
3671   PSE.getSE()->forgetLoop(OrigLoop);
3672 
3673   // Fix-up external users of the induction variables.
3674   for (auto &Entry : Legal->getInductionVars())
3675     fixupIVUsers(Entry.first, Entry.second,
3676                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3677                  IVEndValues[Entry.first], LoopMiddleBlock);
3678 
3679   fixLCSSAPHIs();
3680   for (Instruction *PI : PredicatedInstructions)
3681     sinkScalarOperands(&*PI);
3682 
3683   // Remove redundant induction instructions.
3684   cse(LoopVectorBody);
3685 
  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
3695   assert(!VF.isScalable() &&
3696          "cannot use scalable ElementCount to determine unroll factor");
3697   setProfileInfoAfterUnrolling(
3698       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3699       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3700 }
3701 
3702 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3703   // In order to support recurrences we need to be able to vectorize Phi nodes.
3704   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3705   // stage #2: We now need to fix the recurrences by adding incoming edges to
3706   // the currently empty PHI nodes. At this point every instruction in the
3707   // original loop is widened to a vector form so we can use them to construct
3708   // the incoming edges.
3709   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3710     // Handle first-order recurrences and reductions that need to be fixed.
3711     if (Legal->isFirstOrderRecurrence(&Phi))
3712       fixFirstOrderRecurrence(&Phi);
3713     else if (Legal->isReductionVariable(&Phi))
3714       fixReduction(&Phi);
3715   }
3716 }
3717 
3718 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3719   // This is the second phase of vectorizing first-order recurrences. An
3720   // overview of the transformation is described below. Suppose we have the
3721   // following loop.
3722   //
3723   //   for (int i = 0; i < n; ++i)
3724   //     b[i] = a[i] - a[i - 1];
3725   //
3726   // There is a first-order recurrence on "a". For this loop, the shorthand
3727   // scalar IR looks like:
3728   //
3729   //   scalar.ph:
3730   //     s_init = a[-1]
3731   //     br scalar.body
3732   //
3733   //   scalar.body:
3734   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3735   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3736   //     s2 = a[i]
3737   //     b[i] = s2 - s1
3738   //     br cond, scalar.body, ...
3739   //
  // In this example, s1 is a recurrence because its value depends on the
3741   // previous iteration. In the first phase of vectorization, we created a
3742   // temporary value for s1. We now complete the vectorization and produce the
3743   // shorthand vector IR shown below (for VF = 4, UF = 1).
3744   //
3745   //   vector.ph:
3746   //     v_init = vector(..., ..., ..., a[-1])
3747   //     br vector.body
3748   //
3749   //   vector.body
3750   //     i = phi [0, vector.ph], [i+4, vector.body]
3751   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3752   //     v2 = a[i, i+1, i+2, i+3];
3753   //     v3 = vector(v1(3), v2(0, 1, 2))
3754   //     b[i, i+1, i+2, i+3] = v2 - v3
3755   //     br cond, vector.body, middle.block
3756   //
3757   //   middle.block:
3758   //     x = v2(3)
3759   //     br scalar.ph
3760   //
3761   //   scalar.ph:
3762   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3763   //     br scalar.body
3764   //
  // After the vector loop completes execution, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3767 
3768   // Get the original loop preheader and single loop latch.
3769   auto *Preheader = OrigLoop->getLoopPreheader();
3770   auto *Latch = OrigLoop->getLoopLatch();
3771 
3772   // Get the initial and previous values of the scalar recurrence.
3773   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3774   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3775 
3776   // Create a vector from the initial value.
3777   auto *VectorInit = ScalarInit;
3778   if (VF.isVector()) {
3779     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3780     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
3781     VectorInit = Builder.CreateInsertElement(
3782         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3783         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
3784   }
3785 
3786   // We constructed a temporary phi node in the first phase of vectorization.
3787   // This phi node will eventually be deleted.
3788   Builder.SetInsertPoint(
3789       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3790 
3791   // Create a phi node for the new recurrence. The current value will either be
3792   // the initial value inserted into a vector or loop-varying vector value.
3793   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3794   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3795 
3796   // Get the vectorized previous value of the last part UF - 1. It appears last
3797   // among all unrolled iterations, due to the order of their construction.
3798   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3799 
3800   // Find and set the insertion point after the previous value if it is an
3801   // instruction.
3802   BasicBlock::iterator InsertPt;
3803   // Note that the previous value may have been constant-folded so it is not
3804   // guaranteed to be an instruction in the vector loop.
3805   // FIXME: Loop invariant values do not form recurrences. We should deal with
3806   //        them earlier.
3807   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3808     InsertPt = LoopVectorBody->getFirstInsertionPt();
3809   else {
3810     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3811     if (isa<PHINode>(PreviousLastPart))
3812       // If the previous value is a phi node, we should insert after all the phi
3813       // nodes in the block containing the PHI to avoid breaking basic block
      // verification. Note that the basic block may be different from
      // LoopVectorBody, in case we predicate the loop.
3816       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3817     else
3818       InsertPt = ++PreviousInst->getIterator();
3819   }
3820   Builder.SetInsertPoint(&*InsertPt);
3821 
3822   // We will construct a vector for the recurrence by combining the values for
3823   // the current and previous iterations. This is the required shuffle mask.
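  // E.g. for VF = 4 the mask is <3, 4, 5, 6>: the last element of the first
  // input vector followed by the first three elements of the second.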
3824   assert(!VF.isScalable());
3825   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
3826   ShuffleMask[0] = VF.getKnownMinValue() - 1;
3827   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
3828     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
3829 
3830   // The vector from which to take the initial value for the current iteration
3831   // (actual or unrolled). Initially, this is the vector phi node.
3832   Value *Incoming = VecPhi;
3833 
3834   // Shuffle the current and previous vector and update the vector parts.
3835   for (unsigned Part = 0; Part < UF; ++Part) {
3836     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3837     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3838     auto *Shuffle =
3839         VF.isVector()
3840             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
3841             : Incoming;
3842     PhiPart->replaceAllUsesWith(Shuffle);
3843     cast<Instruction>(PhiPart)->eraseFromParent();
3844     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3845     Incoming = PreviousPart;
3846   }
3847 
3848   // Fix the latch value of the new recurrence in the vector loop.
3849   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3850 
3851   // Extract the last vector element in the middle block. This will be the
3852   // initial value for the recurrence when jumping to the scalar loop.
3853   auto *ExtractForScalar = Incoming;
3854   if (VF.isVector()) {
3855     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3856     ExtractForScalar = Builder.CreateExtractElement(
3857         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
3858         "vector.recur.extract");
3859   }
  // Extract the second-to-last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
3862   // and not the last element (the phi update in the current iteration). This
3863   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3864   // when the scalar loop is not run at all.
3865   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3866   if (VF.isVector())
3867     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3868         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
3869         "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second-to-last element when VF > 1.
3874   else if (UF > 1)
3875     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3876 
3877   // Fix the initial value of the original recurrence in the scalar loop.
3878   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3879   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3880   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3881     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3882     Start->addIncoming(Incoming, BB);
3883   }
3884 
3885   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3886   Phi->setName("scalar.recur");
3887 
3888   // Finally, fix users of the recurrence outside the loop. The users will need
3889   // either the last value of the scalar recurrence or the last value of the
3890   // vector recurrence we extracted in the middle block. Since the loop is in
3891   // LCSSA form, we just need to find all the phi nodes for the original scalar
3892   // recurrence in the exit block, and then add an edge for the middle block.
3893   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3894     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3895       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3896     }
3897   }
3898 }
3899 
3900 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3901   Constant *Zero = Builder.getInt32(0);
3902 
  // Get its reduction variable descriptor.
3904   assert(Legal->isReductionVariable(Phi) &&
3905          "Unable to find the reduction variable");
3906   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3907 
3908   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3909   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3910   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3911   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3912     RdxDesc.getMinMaxRecurrenceKind();
3913   setDebugLocFromInst(Builder, ReductionStartValue);
3914   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3915 
3916   // We need to generate a reduction vector from the incoming scalar.
3917   // To do so, we need to generate the 'identity' vector and override
3918   // one of the elements with the incoming scalar reduction. We need
3919   // to do it in the vector-loop preheader.
3920   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3921 
3922   // This is the vector-clone of the value that leaves the loop.
3923   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3924 
  // Find the reduction identity value: zero for addition, or and xor; one for
  // multiplication; -1 for and.
3927   Value *Identity;
3928   Value *VectorStart;
3929   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3930       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3932     if (VF == 1 || IsInLoopReductionPhi) {
3933       VectorStart = Identity = ReductionStartValue;
3934     } else {
3935       VectorStart = Identity =
3936         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3937     }
3938   } else {
3939     // Handle other reduction kinds:
3940     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3941         RK, VecTy->getScalarType());
3942     if (VF == 1 || IsInLoopReductionPhi) {
3943       Identity = Iden;
3944       // This vector is the Identity vector where the first element is the
3945       // incoming scalar reduction.
3946       VectorStart = ReductionStartValue;
3947     } else {
3948       Identity = ConstantVector::getSplat(VF, Iden);
3949 
3950       // This vector is the Identity vector where the first element is the
3951       // incoming scalar reduction.
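      // E.g. for an integer add reduction with VF = 4, Identity is
      // <0, 0, 0, 0> and VectorStart is <Start, 0, 0, 0>.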
3952       VectorStart =
3953         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3954     }
3955   }
3956 
3957   // Wrap flags are in general invalid after vectorization, clear them.
3958   clearReductionWrapFlags(RdxDesc);
3959 
3960   // Fix the vector-loop phi.
3961 
  // Reductions do not have to start at zero. They can start with
  // any loop-invariant value.
3964   BasicBlock *Latch = OrigLoop->getLoopLatch();
3965   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3966 
3967   for (unsigned Part = 0; Part < UF; ++Part) {
3968     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3969     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3970     // Make sure to add the reduction start value only to the
3971     // first unroll part.
3972     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3973     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3974     cast<PHINode>(VecRdxPhi)
3975       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3976   }
3977 
3978   // Before each round, move the insertion point right between
3979   // the PHIs and the values we are going to write.
3980   // This allows us to write both PHINodes and the extractelement
3981   // instructions.
3982   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3983 
3984   setDebugLocFromInst(Builder, LoopExitInst);
3985 
3986   // If tail is folded by masking, the vector value to leave the loop should be
3987   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3988   // instead of the former.
3989   if (Cost->foldTailByMasking()) {
3990     for (unsigned Part = 0; Part < UF; ++Part) {
3991       Value *VecLoopExitInst =
3992           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3993       Value *Sel = nullptr;
3994       for (User *U : VecLoopExitInst->users()) {
3995         if (isa<SelectInst>(U)) {
3996           assert(!Sel && "Reduction exit feeding two selects");
3997           Sel = U;
3998         } else
3999           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4000       }
4001       assert(Sel && "Reduction exit feeds no select");
4002       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4003 
4004       // If the target can create a predicated operator for the reduction at no
4005       // extra cost in the loop (for example a predicated vadd), it can be
4006       // cheaper for the select to remain in the loop than be sunk out of it,
4007       // and so use the select value for the phi instead of the old
4008       // LoopExitValue.
4009       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4010       if (PreferPredicatedReductionSelect ||
4011           TTI->preferPredicatedReductionSelect(
4012               RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()),
4013               Phi->getType(), TargetTransformInfo::ReductionFlags())) {
4014         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4015         VecRdxPhi->setIncomingValueForBlock(
4016             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4017       }
4018     }
4019   }
4020 
4021   // If the vector reduction can be performed in a smaller type, we truncate
4022   // then extend the loop exit value to enable InstCombine to evaluate the
4023   // entire expression in the smaller type.
4024   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4025     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4026     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4027     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4028     Builder.SetInsertPoint(
4029         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4030     VectorParts RdxParts(UF);
4031     for (unsigned Part = 0; Part < UF; ++Part) {
4032       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4033       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4034       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4035                                         : Builder.CreateZExt(Trunc, VecTy);
4036       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4037            UI != RdxParts[Part]->user_end();)
4038         if (*UI != Trunc) {
4039           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4040           RdxParts[Part] = Extnd;
4041         } else {
4042           ++UI;
4043         }
4044     }
4045     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4046     for (unsigned Part = 0; Part < UF; ++Part) {
4047       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4048       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4049     }
4050   }
4051 
4052   // Reduce all of the unrolled parts into a single vector.
4053   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4054   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4055 
4056   // The middle block terminator has already been assigned a DebugLoc here (the
4057   // OrigLoop's single latch terminator). We want the whole middle block to
4058   // appear to execute on this line because: (a) it is all compiler generated,
4059   // (b) these instructions are always executed after evaluating the latch
4060   // conditional branch, and (c) other passes may add new predecessors which
4061   // terminate on this line. This is the easiest way to ensure we don't
4062   // accidentally cause an extra step back into the loop while debugging.
4063   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
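  // E.g. with UF = 2 this combines the two unrolled parts into a single
  // value: bin.rdx = op part1, part0 (or the corresponding min/max operation
  // for min/max reductions).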
4064   for (unsigned Part = 1; Part < UF; ++Part) {
4065     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4066     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4067       // Floating point operations had to be 'fast' to enable the reduction.
4068       ReducedPartRdx = addFastMathFlag(
4069           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4070                               ReducedPartRdx, "bin.rdx"),
4071           RdxDesc.getFastMathFlags());
4072     else
4073       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4074                                       RdxPart);
4075   }
4076 
  // Create the reduction after the loop. Note that in-loop reductions create
  // the target reduction in the loop using a Reduction recipe.
4079   if (VF.isVector() && !IsInLoopReductionPhi) {
4080     bool NoNaN = Legal->hasFunNoNaNAttr();
4081     ReducedPartRdx =
4082         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4083     // If the reduction can be performed in a smaller type, we need to extend
4084     // the reduction to the wider type before we branch to the original loop.
4085     if (Phi->getType() != RdxDesc.getRecurrenceType())
4086       ReducedPartRdx =
4087         RdxDesc.isSigned()
4088         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4089         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4090   }
4091 
4092   // Create a phi node that merges control-flow from the backedge-taken check
4093   // block and the middle block.
4094   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4095                                         LoopScalarPreHeader->getTerminator());
4096   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4097     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4098   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4099 
4100   // Now, we need to fix the users of the reduction variable
4101   // inside and outside of the scalar remainder loop.
4102   // We know that the loop is in LCSSA form. We need to update the
4103   // PHI nodes in the exit blocks.
4104   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4105     // All PHINodes need to have a single entry edge, or two if
4106     // we already fixed them.
4107     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4108 
4109     // We found a reduction value exit-PHI. Update it with the
4110     // incoming bypass edge.
4111     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4112       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4113   } // end of the LCSSA phi scan.
4114 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
4117   int IncomingEdgeBlockIdx =
4118     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4119   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4120   // Pick the other block.
4121   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4122   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4123   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4124 }
4125 
4126 void InnerLoopVectorizer::clearReductionWrapFlags(
4127     RecurrenceDescriptor &RdxDesc) {
4128   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4129   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4130       RK != RecurrenceDescriptor::RK_IntegerMult)
4131     return;
4132 
4133   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4134   assert(LoopExitInstr && "null loop exit instruction");
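  // Starting from the loop-exit instruction, follow users transitively around
  // the reduction cycle and drop the poison-generating nuw/nsw flags on every
  // overflowing binary operator reached, for each unrolled part.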
4135   SmallVector<Instruction *, 8> Worklist;
4136   SmallPtrSet<Instruction *, 8> Visited;
4137   Worklist.push_back(LoopExitInstr);
4138   Visited.insert(LoopExitInstr);
4139 
4140   while (!Worklist.empty()) {
4141     Instruction *Cur = Worklist.pop_back_val();
4142     if (isa<OverflowingBinaryOperator>(Cur))
4143       for (unsigned Part = 0; Part < UF; ++Part) {
4144         Value *V = getOrCreateVectorValue(Cur, Part);
4145         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4146       }
4147 
4148     for (User *U : Cur->users()) {
4149       Instruction *UI = cast<Instruction>(U);
4150       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4151           Visited.insert(UI).second)
4152         Worklist.push_back(UI);
4153     }
4154   }
4155 }
4156 
4157 void InnerLoopVectorizer::fixLCSSAPHIs() {
4158   assert(!VF.isScalable() && "the code below assumes fixed width vectors");
4159   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4160     if (LCSSAPhi.getNumIncomingValues() == 1) {
4161       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values have a single value, so lane zero
      // suffices.
4163       unsigned LastLane = 0;
4164       if (isa<Instruction>(IncomingValue))
4165         LastLane = Cost->isUniformAfterVectorization(
4166                        cast<Instruction>(IncomingValue), VF)
4167                        ? 0
4168                        : VF.getKnownMinValue() - 1;
4169       // Can be a loop invariant incoming value or the last scalar value to be
4170       // extracted from the vectorized loop.
4171       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4172       Value *lastIncomingValue =
4173           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4174       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4175     }
4176   }
4177 }
4178 
4179 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4180   // The basic block and loop containing the predicated instruction.
4181   auto *PredBB = PredInst->getParent();
4182   auto *VectorLoop = LI->getLoopFor(PredBB);
4183 
4184   // Initialize a worklist with the operands of the predicated instruction.
4185   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4186 
4187   // Holds instructions that we need to analyze again. An instruction may be
4188   // reanalyzed if we don't yet know if we can sink it or not.
4189   SmallVector<Instruction *, 8> InstsToReanalyze;
4190 
4191   // Returns true if a given use occurs in the predicated block. Phi nodes use
4192   // their operands in their corresponding predecessor blocks.
4193   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4194     auto *I = cast<Instruction>(U.getUser());
4195     BasicBlock *BB = I->getParent();
4196     if (auto *Phi = dyn_cast<PHINode>(I))
4197       BB = Phi->getIncomingBlock(
4198           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4199     return BB == PredBB;
4200   };
4201 
4202   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when one pass
  // through the worklist fails to sink a single instruction.
4206   bool Changed;
4207   do {
4208     // Add the instructions that need to be reanalyzed to the worklist, and
4209     // reset the changed indicator.
4210     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4211     InstsToReanalyze.clear();
4212     Changed = false;
4213 
4214     while (!Worklist.empty()) {
4215       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4216 
4217       // We can't sink an instruction if it is a phi node, is already in the
4218       // predicated block, is not in the loop, or may have side effects.
4219       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4220           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4221         continue;
4222 
4223       // It's legal to sink the instruction if all its uses occur in the
4224       // predicated block. Otherwise, there's nothing to do yet, and we may
4225       // need to reanalyze the instruction.
4226       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4227         InstsToReanalyze.push_back(I);
4228         continue;
4229       }
4230 
4231       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4233       I->moveBefore(&*PredBB->getFirstInsertionPt());
4234       Worklist.insert(I->op_begin(), I->op_end());
4235 
4236       // The sinking may have enabled other instructions to be sunk, so we will
4237       // need to iterate.
4238       Changed = true;
4239     }
4240   } while (Changed);
4241 }
4242 
4243 void InnerLoopVectorizer::fixNonInductionPHIs() {
4244   for (PHINode *OrigPhi : OrigPHIsToFix) {
4245     PHINode *NewPhi =
4246         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4247     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4248 
4249     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4250         predecessors(OrigPhi->getParent()));
4251     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4252         predecessors(NewPhi->getParent()));
4253     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4254            "Scalar and Vector BB should have the same number of predecessors");
4255 
4256     // The insertion point in Builder may be invalidated by the time we get
4257     // here. Force the Builder insertion point to something valid so that we do
4258     // not run into issues during insertion point restore in
4259     // getOrCreateVectorValue calls below.
4260     Builder.SetInsertPoint(NewPhi);
4261 
4262     // The predecessor order is preserved and we can rely on mapping between
4263     // scalar and vector block predecessors.
4264     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4265       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4266 
4267       // When looking up the new scalar/vector values to fix up, use incoming
4268       // values from original phi.
4269       Value *ScIncV =
4270           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4271 
      // A scalar incoming value may need a broadcast.
4273       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4274       NewPhi->addIncoming(NewIncV, NewPredBB);
4275     }
4276   }
4277 }
4278 
4279 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
4280                                    unsigned UF, ElementCount VF,
4281                                    bool IsPtrLoopInvariant,
4282                                    SmallBitVector &IsIndexLoopInvariant,
4283                                    VPTransformState &State) {
4284   // Construct a vector GEP by widening the operands of the scalar GEP as
4285   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4286   // results in a vector of pointers when at least one operand of the GEP
4287   // is vector-typed. Thus, to keep the representation compact, we only use
4288   // vector-typed operands for loop-varying values.
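  // E.g. a GEP with a loop-invariant base pointer and a loop-varying index
  // becomes "getelementptr T, T* %base, <VF x i64> %idx", producing a vector
  // of VF pointers per unroll part.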
4289 
4290   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4291     // If we are vectorizing, but the GEP has only loop-invariant operands,
4292     // the GEP we build (by only using vector-typed operands for
4293     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4294     // produce a vector of pointers, we need to either arbitrarily pick an
4295     // operand to broadcast, or broadcast a clone of the original GEP.
4296     // Here, we broadcast a clone of the original.
4297     //
4298     // TODO: If at some point we decide to scalarize instructions having
4299     //       loop-invariant operands, this special case will no longer be
4300     //       required. We would add the scalarization decision to
4301     //       collectLoopScalars() and teach getVectorValue() to broadcast
4302     //       the lane-zero scalar value.
4303     auto *Clone = Builder.Insert(GEP->clone());
4304     for (unsigned Part = 0; Part < UF; ++Part) {
4305       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4306       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4307       addMetadata(EntryPart, GEP);
4308     }
4309   } else {
4310     // If the GEP has at least one loop-varying operand, we are sure to
4311     // produce a vector of pointers. But if we are only unrolling, we want
4312     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4313     // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping (VectorLoopValueMap), as we do for other
    // instructions.
4317     for (unsigned Part = 0; Part < UF; ++Part) {
4318       // The pointer operand of the new GEP. If it's loop-invariant, we
4319       // won't broadcast it.
4320       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4321                                      : State.get(Operands.getOperand(0), Part);
4322 
4323       // Collect all the indices for the new GEP. If any index is
4324       // loop-invariant, we won't broadcast it.
4325       SmallVector<Value *, 4> Indices;
4326       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4327         VPValue *Operand = Operands.getOperand(I);
4328         if (IsIndexLoopInvariant[I - 1])
4329           Indices.push_back(State.get(Operand, {0, 0}));
4330         else
4331           Indices.push_back(State.get(Operand, Part));
4332       }
4333 
4334       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4335       // but it should be a vector, otherwise.
4336       auto *NewGEP =
4337           GEP->isInBounds()
4338               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4339                                           Indices)
4340               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4341       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4342              "NewGEP is not a pointer vector");
4343       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4344       addMetadata(NewGEP, GEP);
4345     }
4346   }
4347 }
4348 
4349 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4350                                               ElementCount VF) {
4351   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4352   PHINode *P = cast<PHINode>(PN);
4353   if (EnableVPlanNativePath) {
4354     // Currently we enter here in the VPlan-native path for non-induction
4355     // PHIs where all control flow is uniform. We simply widen these PHIs.
4356     // Create a vector phi with no operands - the vector phi operands will be
4357     // set at the end of vector code generation.
4358     Type *VecTy =
4359         (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4360     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4361     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4362     OrigPHIsToFix.push_back(P);
4363 
4364     return;
4365   }
4366 
4367   assert(PN->getParent() == OrigLoop->getHeader() &&
4368          "Non-header phis should have been handled elsewhere");
4369 
4370   // In order to support recurrences we need to be able to vectorize Phi nodes.
4371   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4372   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4373   // this value when we vectorize all of the instructions that use the PHI.
4374   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4375     for (unsigned Part = 0; Part < UF; ++Part) {
4376       // This is phase one of vectorizing PHIs.
4377       bool ScalarPHI =
4378           (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4379       Type *VecTy =
4380           ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4381       Value *EntryPart = PHINode::Create(
4382           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4383       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4384     }
4385     return;
4386   }
4387 
4388   setDebugLocFromInst(Builder, P);
4389 
4390   // This PHINode must be an induction variable.
4391   // Make sure that we know about it.
4392   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4393 
4394   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4395   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4396 
4397   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4398   // which can be found from the original scalar operations.
4399   switch (II.getKind()) {
4400   case InductionDescriptor::IK_NoInduction:
4401     llvm_unreachable("Unknown induction");
4402   case InductionDescriptor::IK_IntInduction:
4403   case InductionDescriptor::IK_FpInduction:
4404     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4405   case InductionDescriptor::IK_PtrInduction: {
4406     // Handle the pointer induction variable case.
4407     assert(P->getType()->isPointerTy() && "Unexpected type.");
4408 
4409     if (Cost->isScalarAfterVectorization(P, VF)) {
4410       // This is the normalized GEP that starts counting at zero.
4411       Value *PtrInd =
4412           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4413       // Determine the number of scalars we need to generate for each unroll
4414       // iteration. If the instruction is uniform, we only need to generate the
4415       // first lane. Otherwise, we generate all VF values.
4416       unsigned Lanes =
4417           Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4418       for (unsigned Part = 0; Part < UF; ++Part) {
4419         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4420           Constant *Idx = ConstantInt::get(PtrInd->getType(),
4421                                            Lane + Part * VF.getKnownMinValue());
4422           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4423           Value *SclrGep =
4424               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4425           SclrGep->setName("next.gep");
4426           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4427         }
4428       }
4429       return;
4430     }
4431     assert(isa<SCEVConstant>(II.getStep()) &&
4432            "Induction step not a SCEV constant!");
4433     Type *PhiType = II.getStep()->getType();
4434 
4435     // Build a pointer phi
4436     Value *ScalarStartValue = II.getStartValue();
4437     Type *ScStValueType = ScalarStartValue->getType();
4438     PHINode *NewPointerPhi =
4439         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4440     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4441 
4442     // A pointer induction, performed by using a gep
4443     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4444     Instruction *InductionLoc = LoopLatch->getTerminator();
4445     const SCEV *ScalarStep = II.getStep();
4446     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4447     Value *ScalarStepValue =
4448         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4449     Value *InductionGEP = GetElementPtrInst::Create(
4450         ScStValueType->getPointerElementType(), NewPointerPhi,
4451         Builder.CreateMul(
4452             ScalarStepValue,
4453             ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4454         "ptr.ind", InductionLoc);
4455     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4456 
4457     // Create UF many actual address geps that use the pointer
4458     // phi as base and a vectorized version of the step value
4459     // (<step*0, ..., step*N>) as offset.
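    // E.g. with VF = 4, UF = 2 and step S, part 0 uses offsets
    // <0, S, 2*S, 3*S> and part 1 uses <4*S, 5*S, 6*S, 7*S>, both relative to
    // the pointer phi.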
4460     for (unsigned Part = 0; Part < UF; ++Part) {
4461       SmallVector<Constant *, 8> Indices;
4462       // Create a vector of consecutive numbers from zero to VF.
4463       for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4464         Indices.push_back(
4465             ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4466       Constant *StartOffset = ConstantVector::get(Indices);
4467 
4468       Value *GEP = Builder.CreateGEP(
4469           ScStValueType->getPointerElementType(), NewPointerPhi,
4470           Builder.CreateMul(
4471               StartOffset,
4472               Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4473               "vector.gep"));
4474       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4475     }
4476   }
4477   }
4478 }
4479 
4480 /// A helper function for checking whether an integer division-related
4481 /// instruction may divide by zero (in which case it must be predicated if
4482 /// executed conditionally in the scalar code).
4483 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so without predication.
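/// E.g. a udiv whose divisor is a non-zero constant integer never needs
/// predication, whereas one whose divisor is loop-variant (or zero) does.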
4487 static bool mayDivideByZero(Instruction &I) {
4488   assert((I.getOpcode() == Instruction::UDiv ||
4489           I.getOpcode() == Instruction::SDiv ||
4490           I.getOpcode() == Instruction::URem ||
4491           I.getOpcode() == Instruction::SRem) &&
4492          "Unexpected instruction");
4493   Value *Divisor = I.getOperand(1);
4494   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4495   return !CInt || CInt->isZero();
4496 }
4497 
4498 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
4499                                            VPTransformState &State) {
4500   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4501   switch (I.getOpcode()) {
4502   case Instruction::Call:
4503   case Instruction::Br:
4504   case Instruction::PHI:
4505   case Instruction::GetElementPtr:
4506   case Instruction::Select:
4507     llvm_unreachable("This instruction is handled by a different recipe.");
4508   case Instruction::UDiv:
4509   case Instruction::SDiv:
4510   case Instruction::SRem:
4511   case Instruction::URem:
4512   case Instruction::Add:
4513   case Instruction::FAdd:
4514   case Instruction::Sub:
4515   case Instruction::FSub:
4516   case Instruction::FNeg:
4517   case Instruction::Mul:
4518   case Instruction::FMul:
4519   case Instruction::FDiv:
4520   case Instruction::FRem:
4521   case Instruction::Shl:
4522   case Instruction::LShr:
4523   case Instruction::AShr:
4524   case Instruction::And:
4525   case Instruction::Or:
4526   case Instruction::Xor: {
4527     // Just widen unops and binops.
4528     setDebugLocFromInst(Builder, &I);
4529 
4530     for (unsigned Part = 0; Part < UF; ++Part) {
4531       SmallVector<Value *, 2> Ops;
4532       for (VPValue *VPOp : User.operands())
4533         Ops.push_back(State.get(VPOp, Part));
4534 
4535       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4536 
4537       if (auto *VecOp = dyn_cast<Instruction>(V))
4538         VecOp->copyIRFlags(&I);
4539 
4540       // Use this vector value for all users of the original instruction.
4541       VectorLoopValueMap.setVectorValue(&I, Part, V);
4542       addMetadata(V, &I);
4543     }
4544 
4545     break;
4546   }
4547   case Instruction::ICmp:
4548   case Instruction::FCmp: {
4549     // Widen compares. Generate vector compares.
4550     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4551     auto *Cmp = cast<CmpInst>(&I);
4552     setDebugLocFromInst(Builder, Cmp);
4553     for (unsigned Part = 0; Part < UF; ++Part) {
4554       Value *A = State.get(User.getOperand(0), Part);
4555       Value *B = State.get(User.getOperand(1), Part);
4556       Value *C = nullptr;
4557       if (FCmp) {
4558         // Propagate fast math flags.
4559         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4560         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4561         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4562       } else {
4563         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4564       }
4565       VectorLoopValueMap.setVectorValue(&I, Part, C);
4566       addMetadata(C, &I);
4567     }
4568 
4569     break;
4570   }
4571 
4572   case Instruction::ZExt:
4573   case Instruction::SExt:
4574   case Instruction::FPToUI:
4575   case Instruction::FPToSI:
4576   case Instruction::FPExt:
4577   case Instruction::PtrToInt:
4578   case Instruction::IntToPtr:
4579   case Instruction::SIToFP:
4580   case Instruction::UIToFP:
4581   case Instruction::Trunc:
4582   case Instruction::FPTrunc:
4583   case Instruction::BitCast: {
4584     auto *CI = cast<CastInst>(&I);
4585     setDebugLocFromInst(Builder, CI);
4586 
    // Vectorize casts.
4588     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4589     Type *DestTy =
4590         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4591 
4592     for (unsigned Part = 0; Part < UF; ++Part) {
4593       Value *A = State.get(User.getOperand(0), Part);
4594       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4595       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4596       addMetadata(Cast, &I);
4597     }
4598     break;
4599   }
4600   default:
4601     // This instruction is not vectorized by simple widening.
4602     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4603     llvm_unreachable("Unhandled instruction!");
4604   } // end of switch.
4605 }
4606 
4607 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
4608                                                VPTransformState &State) {
4609   assert(!isa<DbgInfoIntrinsic>(I) &&
4610          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4611   setDebugLocFromInst(Builder, &I);
4612 
4613   Module *M = I.getParent()->getParent()->getParent();
4614   auto *CI = cast<CallInst>(&I);
4615 
4616   SmallVector<Type *, 4> Tys;
4617   for (Value *ArgOperand : CI->arg_operands())
4618     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4619 
4620   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4621 
  // This flag shows whether we use an intrinsic or a plain call for the
  // vectorized version of the instruction, i.e. whether it is beneficial to
  // perform an intrinsic call rather than a library call.
4625   bool NeedToScalarize = false;
4626   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4627   bool UseVectorIntrinsic =
4628       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4629   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4630          "Instruction should be scalarized elsewhere.");
4631 
4632   for (unsigned Part = 0; Part < UF; ++Part) {
4633     SmallVector<Value *, 4> Args;
4634     for (auto &I : enumerate(ArgOperands.operands())) {
4635       // Some intrinsics have a scalar argument - don't replace it with a
4636       // vector.
4637       Value *Arg;
4638       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4639         Arg = State.get(I.value(), Part);
4640       else
4641         Arg = State.get(I.value(), {0, 0});
4642       Args.push_back(Arg);
4643     }
4644 
4645     Function *VectorF;
4646     if (UseVectorIntrinsic) {
4647       // Use vector version of the intrinsic.
4648       Type *TysForDecl[] = {CI->getType()};
4649       if (VF.isVector()) {
4650         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4651         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4652       }
4653       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4654       assert(VectorF && "Can't retrieve vector intrinsic.");
4655     } else {
4656       // Use vector version of the function call.
4657       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4658 #ifndef NDEBUG
4659       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4660              "Can't create vector function.");
4661 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    VectorLoopValueMap.setVectorValue(&I, Part, V);
    addMetadata(V, &I);
4673   }
4674 }
4675 
4676 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
4677                                                  VPUser &Operands,
4678                                                  bool InvariantCond,
4679                                                  VPTransformState &State) {
4680   setDebugLocFromInst(Builder, &I);
4681 
  // The condition can be loop invariant but still defined inside the
4683   // loop. This means that we can't just use the original 'cond' value.
4684   // We have to take the 'vectorized' value and pick the first lane.
4685   // Instcombine will make this a no-op.
4686   auto *InvarCond =
4687       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4688 
4689   for (unsigned Part = 0; Part < UF; ++Part) {
4690     Value *Cond =
4691         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4692     Value *Op0 = State.get(Operands.getOperand(1), Part);
4693     Value *Op1 = State.get(Operands.getOperand(2), Part);
4694     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4695     VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4696     addMetadata(Sel, &I);
4697   }
4698 }
4699 
4700 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4701   // We should not collect Scalars more than once per VF. Right now, this
4702   // function is called from collectUniformsAndScalars(), which already does
4703   // this check. Collecting Scalars for VF=1 does not make any sense.
4704   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4705          "This function should not be visited twice for the same VF");
4706 
4707   SmallSetVector<Instruction *, 8> Worklist;
4708 
4709   // These sets are used to seed the analysis with pointers used by memory
4710   // accesses that will remain scalar.
4711   SmallSetVector<Instruction *, 8> ScalarPtrs;
4712   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4713   auto *Latch = TheLoop->getLoopLatch();
4714 
4715   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4716   // The pointer operands of loads and stores will be scalar as long as the
4717   // memory access is not a gather or scatter operation. The value operand of a
4718   // store will remain scalar if the store is scalarized.
4719   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4720     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4721     assert(WideningDecision != CM_Unknown &&
4722            "Widening decision should be ready at this moment");
4723     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4724       if (Ptr == Store->getValueOperand())
4725         return WideningDecision == CM_Scalarize;
4726     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4727            "Ptr is neither a value or pointer operand");
4728     return WideningDecision != CM_GatherScatter;
4729   };
4730 
4731   // A helper that returns true if the given value is a bitcast or
4732   // getelementptr instruction contained in the loop.
4733   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4734     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4735             isa<GetElementPtrInst>(V)) &&
4736            !TheLoop->isLoopInvariant(V);
4737   };
4738 
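  // A helper that returns true if \p Ptr is a pointer induction of the loop
  // and its use by \p MemAccess will remain scalar.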
4739   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4740     if (!isa<PHINode>(Ptr) ||
4741         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4742       return false;
4743     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4744     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4745       return false;
4746     return isScalarUse(MemAccess, Ptr);
4747   };
4748 
  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is a pointer induction of the loop whose use remains scalar, the
  // induction and its update are inserted into the worklist. Otherwise, if
  // the use will be a scalar use and the pointer is only used by memory
  // accesses, we place the pointer in ScalarPtrs; if not, the pointer is
  // placed in PossibleNonScalarPtrs.
4754   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4755     if (isScalarPtrInduction(MemAccess, Ptr)) {
4756       Worklist.insert(cast<Instruction>(Ptr));
4757       Instruction *Update = cast<Instruction>(
4758           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4759       Worklist.insert(Update);
4760       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4761                         << "\n");
4762       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4763                         << "\n");
4764       return;
4765     }
4766     // We only care about bitcast and getelementptr instructions contained in
4767     // the loop.
4768     if (!isLoopVaryingBitCastOrGEP(Ptr))
4769       return;
4770 
4771     // If the pointer has already been identified as scalar (e.g., if it was
4772     // also identified as uniform), there's nothing to do.
4773     auto *I = cast<Instruction>(Ptr);
4774     if (Worklist.count(I))
4775       return;
4776 
4777     // If the use of the pointer will be a scalar use, and all users of the
4778     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4779     // place the pointer in PossibleNonScalarPtrs.
4780     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4781           return isa<LoadInst>(U) || isa<StoreInst>(U);
4782         }))
4783       ScalarPtrs.insert(I);
4784     else
4785       PossibleNonScalarPtrs.insert(I);
4786   };
4787 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
4792   //
4793   // (1) Add to the worklist all instructions that have been identified as
4794   // uniform-after-vectorization.
4795   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4796 
4797   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4798   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4800   // scatter operation. The value operand of a store will remain scalar if the
4801   // store is scalarized.
4802   for (auto *BB : TheLoop->blocks())
4803     for (auto &I : *BB) {
4804       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4805         evaluatePtrUse(Load, Load->getPointerOperand());
4806       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4807         evaluatePtrUse(Store, Store->getPointerOperand());
4808         evaluatePtrUse(Store, Store->getValueOperand());
4809       }
4810     }
4811   for (auto *I : ScalarPtrs)
4812     if (!PossibleNonScalarPtrs.count(I)) {
4813       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4814       Worklist.insert(I);
4815     }
4816 
4817   // Insert the forced scalars.
4818   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4819   // induction variable when the PHI user is scalarized.
4820   auto ForcedScalar = ForcedScalars.find(VF);
4821   if (ForcedScalar != ForcedScalars.end())
4822     for (auto *I : ForcedScalar->second)
4823       Worklist.insert(I);
4824 
4825   // Expand the worklist by looking through any bitcasts and getelementptr
4826   // instructions we've already identified as scalar. This is similar to the
4827   // expansion step in collectLoopUniforms(); however, here we're only
4828   // expanding to include additional bitcasts and getelementptr instructions.
4829   unsigned Idx = 0;
4830   while (Idx != Worklist.size()) {
4831     Instruction *Dst = Worklist[Idx++];
4832     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4833       continue;
4834     auto *Src = cast<Instruction>(Dst->getOperand(0));
4835     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4836           auto *J = cast<Instruction>(U);
4837           return !TheLoop->contains(J) || Worklist.count(J) ||
4838                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4839                   isScalarUse(J, Src));
4840         })) {
4841       Worklist.insert(Src);
4842       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4843     }
4844   }
4845 
4846   // An induction variable will remain scalar if all users of the induction
4847   // variable and induction variable update remain scalar.
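  // For example (hypothetical): if an induction variable i is used only by
  // the address computations of consecutive accesses (whose GEPs are already
  // in the worklist) and by its own update i.next, then both i and i.next
  // are added below and remain scalar after vectorization.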
4848   for (auto &Induction : Legal->getInductionVars()) {
4849     auto *Ind = Induction.first;
4850     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4851 
4852     // If tail-folding is applied, the primary induction variable will be used
4853     // to feed a vector compare.
4854     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4855       continue;
4856 
4857     // Determine if all users of the induction variable are scalar after
4858     // vectorization.
4859     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4860       auto *I = cast<Instruction>(U);
4861       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4862     });
4863     if (!ScalarInd)
4864       continue;
4865 
4866     // Determine if all users of the induction variable update instruction are
4867     // scalar after vectorization.
4868     auto ScalarIndUpdate =
4869         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4870           auto *I = cast<Instruction>(U);
4871           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4872         });
4873     if (!ScalarIndUpdate)
4874       continue;
4875 
4876     // The induction variable and its update instruction will remain scalar.
4877     Worklist.insert(Ind);
4878     Worklist.insert(IndUpdate);
4879     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4880     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4881                       << "\n");
4882   }
4883 
4884   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4885 }
4886 
4887 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
4888                                                          ElementCount VF) {
4889   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4890   if (!blockNeedsPredication(I->getParent()))
4891     return false;
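  // Illustrative example (hypothetical source loop): in
  //   for (i = 0; i < n; ++i) if (c[i]) a[i] = x / b[i];
  // the block guarded by c[i] needs predication, so the division (which may
  // divide by zero) is scalar with predication, as is the store to a[i] if
  // the target has no legal masked store for its type.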
  switch (I->getOpcode()) {
4893   default:
4894     break;
4895   case Instruction::Load:
4896   case Instruction::Store: {
4897     if (!Legal->isMaskRequired(I))
4898       return false;
4899     auto *Ptr = getLoadStorePointerOperand(I);
4900     auto *Ty = getMemInstValueType(I);
4901     // We have already decided how to vectorize this instruction, get that
4902     // result.
4903     if (VF.isVector()) {
4904       InstWidening WideningDecision = getWideningDecision(I, VF);
4905       assert(WideningDecision != CM_Unknown &&
4906              "Widening decision should be ready at this moment");
4907       return WideningDecision == CM_Scalarize;
4908     }
4909     const Align Alignment = getLoadStoreAlignment(I);
4910     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4911                                 isLegalMaskedGather(Ty, Alignment))
4912                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4913                                 isLegalMaskedScatter(Ty, Alignment));
4914   }
4915   case Instruction::UDiv:
4916   case Instruction::SDiv:
4917   case Instruction::SRem:
4918   case Instruction::URem:
4919     return mayDivideByZero(*I);
4920   }
4921   return false;
4922 }
4923 
4924 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4925     Instruction *I, ElementCount VF) {
4926   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4927   assert(getWideningDecision(I, VF) == CM_Unknown &&
4928          "Decision should not be set yet.");
4929   auto *Group = getInterleavedAccessGroup(I);
4930   assert(Group && "Must have a group.");
4931 
  // If the instruction's allocated size doesn't equal its type size, it
4933   // requires padding and will be scalarized.
4934   auto &DL = I->getModule()->getDataLayout();
4935   auto *ScalarTy = getMemInstValueType(I);
4936   if (hasIrregularType(ScalarTy, DL, VF))
4937     return false;
4938 
4939   // Check if masking is required.
4940   // A Group may need masking for one of two reasons: it resides in a block that
4941   // needs predication, or it was decided to use masking to deal with gaps.
4942   bool PredicatedAccessRequiresMasking =
4943       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4944   bool AccessWithGapsRequiresMasking =
4945       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4946   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4947     return true;
4948 
4949   // If masked interleaving is required, we expect that the user/target had
4950   // enabled it, because otherwise it either wouldn't have been created or
4951   // it should have been invalidated by the CostModel.
4952   assert(useMaskedInterleavedAccesses(TTI) &&
4953          "Masked interleave-groups for predicated accesses are not enabled.");
4954 
4955   auto *Ty = getMemInstValueType(I);
4956   const Align Alignment = getLoadStoreAlignment(I);
4957   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4958                           : TTI.isLegalMaskedStore(Ty, Alignment);
4959 }
4960 
4961 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4962     Instruction *I, ElementCount VF) {
4963   // Get and ensure we have a valid memory instruction.
4964   LoadInst *LI = dyn_cast<LoadInst>(I);
4965   StoreInst *SI = dyn_cast<StoreInst>(I);
4966   assert((LI || SI) && "Invalid memory instruction");
4967 
4968   auto *Ptr = getLoadStorePointerOperand(I);
4969 
  // First of all, in order to be widened, the pointer must be consecutive.
4971   if (!Legal->isConsecutivePtr(Ptr))
4972     return false;
4973 
4974   // If the instruction is a store located in a predicated block, it will be
4975   // scalarized.
4976   if (isScalarWithPredication(I))
4977     return false;
4978 
  // If the instruction's allocated size doesn't equal its type size, it
4980   // requires padding and will be scalarized.
4981   auto &DL = I->getModule()->getDataLayout();
4982   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4983   if (hasIrregularType(ScalarTy, DL, VF))
4984     return false;
4985 
4986   return true;
4987 }
4988 
4989 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4990   // We should not collect Uniforms more than once per VF. Right now,
4991   // this function is called from collectUniformsAndScalars(), which
4992   // already does this check. Collecting Uniforms for VF=1 does not make any
4993   // sense.
4994 
4995   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4996          "This function should not be visited twice for the same VF");
4997 
  // Initialize the entry for this VF. Even if we do not find any uniform
  // value, we will not analyze this VF again: Uniforms.count(VF) will
  // return 1.
5000   Uniforms[VF].clear();
5001 
5002   // We now know that the loop is vectorizable!
5003   // Collect instructions inside the loop that will remain uniform after
5004   // vectorization.
5005 
  // Global values, parameters, and instructions outside of the current loop
  // are out of scope.
5008   auto isOutOfScope = [&](Value *V) -> bool {
5009     Instruction *I = dyn_cast<Instruction>(V);
5010     return (!I || !TheLoop->contains(I));
5011   };
5012 
5013   SetVector<Instruction *> Worklist;
5014   BasicBlock *Latch = TheLoop->getLoopLatch();
5015 
5016   // Instructions that are scalar with predication must not be considered
5017   // uniform after vectorization, because that would create an erroneous
5018   // replicating region where only a single instance out of VF should be formed.
5019   // TODO: optimize such seldom cases if found important, see PR40816.
5020   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5021     if (isScalarWithPredication(I, VF)) {
5022       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5023                         << *I << "\n");
5024       return;
5025     }
5026     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5027     Worklist.insert(I);
5028   };
5029 
5030   // Start with the conditional branch. If the branch condition is an
5031   // instruction contained in the loop that is only used by the branch, it is
5032   // uniform.
5033   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5034   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5035     addToWorklistIfAllowed(Cmp);
5036 
5037   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
5038   // are pointers that are treated like consecutive pointers during
5039   // vectorization. The pointer operands of interleaved accesses are an
5040   // example.
5041   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
5042 
5043   // Holds pointer operands of instructions that are possibly non-uniform.
5044   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
5045 
5046   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5047     InstWidening WideningDecision = getWideningDecision(I, VF);
5048     assert(WideningDecision != CM_Unknown &&
5049            "Widening decision should be ready at this moment");
5050 
5051     return (WideningDecision == CM_Widen ||
5052             WideningDecision == CM_Widen_Reverse ||
5053             WideningDecision == CM_Interleave);
5054   };
5055   // Iterate over the instructions in the loop, and collect all
5056   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
5057   // that a consecutive-like pointer operand will be scalarized, we collect it
5058   // in PossibleNonUniformPtrs instead. We use two sets here because a single
5059   // getelementptr instruction can be used by both vectorized and scalarized
5060   // memory instructions. For example, if a loop loads and stores from the same
5061   // location, but the store is conditional, the store will be scalarized, and
5062   // the getelementptr won't remain uniform.
5063   for (auto *BB : TheLoop->blocks())
5064     for (auto &I : *BB) {
5065       // If there's no pointer operand, there's nothing to do.
5066       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5067       if (!Ptr)
5068         continue;
5069 
5070       // True if all users of Ptr are memory accesses that have Ptr as their
5071       // pointer operand.
5072       auto UsersAreMemAccesses =
5073           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
5074             return getLoadStorePointerOperand(U) == Ptr;
5075           });
5076 
5077       // Ensure the memory instruction will not be scalarized or used by
5078       // gather/scatter, making its pointer operand non-uniform. If the pointer
5079       // operand is used by any instruction other than a memory access, we
5080       // conservatively assume the pointer operand may be non-uniform.
5081       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
5082         PossibleNonUniformPtrs.insert(Ptr);
5083 
      // If the memory instruction will be vectorized and its pointer operand
      // is consecutive-like or part of an interleaved group, the pointer
      // operand should remain uniform.
5087       else
5088         ConsecutiveLikePtrs.insert(Ptr);
5089     }
5090 
5091   // Add to the Worklist all consecutive and consecutive-like pointers that
5092   // aren't also identified as possibly non-uniform.
5093   for (auto *V : ConsecutiveLikePtrs)
5094     if (!PossibleNonUniformPtrs.count(V))
5095       addToWorklistIfAllowed(V);
5096 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
5100   unsigned idx = 0;
5101   while (idx != Worklist.size()) {
5102     Instruction *I = Worklist[idx++];
5103 
5104     for (auto OV : I->operand_values()) {
5105       // isOutOfScope operands cannot be uniform instructions.
5106       if (isOutOfScope(OV))
5107         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
5110       auto *OP = dyn_cast<PHINode>(OV);
5111       if (OP && Legal->isFirstOrderRecurrence(OP))
5112         continue;
5113       // If all the users of the operand are uniform, then add the
5114       // operand into the uniform worklist.
5115       auto *OI = cast<Instruction>(OV);
5116       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5117             auto *J = cast<Instruction>(U);
5118             return Worklist.count(J) ||
5119                    (OI == getLoadStorePointerOperand(J) &&
5120                     isUniformDecision(J, VF));
5121           }))
5122         addToWorklistIfAllowed(OI);
5123     }
5124   }
5125 
5126   // Returns true if Ptr is the pointer operand of a memory access instruction
5127   // I, and I is known to not require scalarization.
5128   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5129     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5130   };
5131 
5132   // For an instruction to be added into Worklist above, all its users inside
5133   // the loop should also be in Worklist. However, this condition cannot be
5134   // true for phi nodes that form a cyclic dependence. We must process phi
5135   // nodes separately. An induction variable will remain uniform if all users
5136   // of the induction variable and induction variable update remain uniform.
5137   // The code below handles both pointer and non-pointer induction variables.
5138   for (auto &Induction : Legal->getInductionVars()) {
5139     auto *Ind = Induction.first;
5140     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5141 
5142     // Determine if all users of the induction variable are uniform after
5143     // vectorization.
5144     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5145       auto *I = cast<Instruction>(U);
5146       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5147              isVectorizedMemAccessUse(I, Ind);
5148     });
5149     if (!UniformInd)
5150       continue;
5151 
5152     // Determine if all users of the induction variable update instruction are
5153     // uniform after vectorization.
5154     auto UniformIndUpdate =
5155         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5156           auto *I = cast<Instruction>(U);
5157           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5158                  isVectorizedMemAccessUse(I, IndUpdate);
5159         });
5160     if (!UniformIndUpdate)
5161       continue;
5162 
5163     // The induction variable and its update instruction will remain uniform.
5164     addToWorklistIfAllowed(Ind);
5165     addToWorklistIfAllowed(IndUpdate);
5166   }
5167 
5168   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5169 }
5170 
5171 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5172   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5173 
5174   if (Legal->getRuntimePointerChecking()->Need) {
5175     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5176         "runtime pointer checks needed. Enable vectorization of this "
5177         "loop with '#pragma clang loop vectorize(enable)' when "
5178         "compiling with -Os/-Oz",
5179         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5180     return true;
5181   }
5182 
5183   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5184     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5185         "runtime SCEV checks needed. Enable vectorization of this "
5186         "loop with '#pragma clang loop vectorize(enable)' when "
5187         "compiling with -Os/-Oz",
5188         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5189     return true;
5190   }
5191 
5192   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5193   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5194     reportVectorizationFailure("Runtime stride check for small trip count",
5195         "runtime stride == 1 checks needed. Enable vectorization of "
5196         "this loop without such check by compiling with -Os/-Oz",
5197         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5198     return true;
5199   }
5200 
5201   return false;
5202 }
5203 
5204 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
5205                                                             unsigned UserIC) {
5206   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the check is still
    // likely to be dynamically uniform if the target can skip it.
5209     reportVectorizationFailure(
5210         "Not inserting runtime ptr check for divergent target",
5211         "runtime pointer checks needed. Not enabled for divergent target",
5212         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5213     return None;
5214   }
5215 
5216   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5217   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5218   if (TC == 1) {
5219     reportVectorizationFailure("Single iteration (non) loop",
5220         "loop trip count is one, irrelevant for vectorization",
5221         "SingleIterationLoop", ORE, TheLoop);
5222     return None;
5223   }
5224 
5225   switch (ScalarEpilogueStatus) {
5226   case CM_ScalarEpilogueAllowed:
5227     return UserVF ? UserVF : computeFeasibleMaxVF(TC);
5228   case CM_ScalarEpilogueNotNeededUsePredicate:
5229     LLVM_DEBUG(
5230         dbgs() << "LV: vector predicate hint/switch found.\n"
5231                << "LV: Not allowing scalar epilogue, creating predicated "
5232                << "vector loop.\n");
5233     break;
5234   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5235     // fallthrough as a special case of OptForSize
5236   case CM_ScalarEpilogueNotAllowedOptSize:
5237     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5238       LLVM_DEBUG(
5239           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5240     else
5241       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5242                         << "count.\n");
5243 
5244     // Bail if runtime checks are required, which are not good when optimising
5245     // for size.
5246     if (runtimeChecksRequired())
5247       return None;
5248     break;
5249   }
5250 
5251   // Now try the tail folding
5252 
5253   // Invalidate interleave groups that require an epilogue if we can't mask
5254   // the interleave-group.
5255   if (!useMaskedInterleavedAccesses(TTI)) {
5256     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5257            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
5260     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5261   }
5262 
5263   unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
5264   assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
5265   unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
5266   if (TC > 0 && TC % MaxVFtimesIC == 0) {
5267     // Accept MaxVF if we do not have a tail.
5268     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5269     return MaxVF;
5270   }
5271 
5272   // If we don't know the precise trip count, or if the trip count that we
5273   // found modulo the vectorization factor is not zero, try to fold the tail
5274   // by masking.
5275   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5276   if (Legal->prepareToFoldTailByMasking()) {
5277     FoldTailByMasking = true;
5278     return MaxVF;
5279   }
5280 
5281   // If there was a tail-folding hint/switch, but we can't fold the tail by
5282   // masking, fallback to a vectorization with a scalar epilogue.
5283   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5284     if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) {
5285       LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5286       return None;
5287     }
5288     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5289                          "scalar epilogue instead.\n");
5290     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5291     return MaxVF;
5292   }
5293 
5294   if (TC == 0) {
5295     reportVectorizationFailure(
5296         "Unable to calculate the loop count due to complex control flow",
5297         "unable to calculate the loop count due to complex control flow",
5298         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5299     return None;
5300   }
5301 
5302   reportVectorizationFailure(
5303       "Cannot optimize for size and vectorize at the same time.",
5304       "cannot optimize for size and vectorize at the same time. "
5305       "Enable vectorization of this loop with '#pragma clang loop "
5306       "vectorize(enable)' when compiling with -Os/-Oz",
5307       "NoTailLoopWithOptForSize", ORE, TheLoop);
5308   return None;
5309 }
5310 
5311 unsigned
5312 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5313   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5314   unsigned SmallestType, WidestType;
5315   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5316   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5317 
5318   // Get the maximum safe dependence distance in bits computed by LAA.
5319   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
5321   // dependence distance).
5322   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5323 
5324   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5325 
5326   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
5328   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
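  // For example (assuming a 256-bit widest register and a 32-bit widest
  // type), MaxVectorSize = PowerOf2Floor(256 / 32) = 8 lanes.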
5329 
5330   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5331                     << " / " << WidestType << " bits.\n");
5332   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5333                     << WidestRegister << " bits.\n");
5334 
5335   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5336                                  " into one vector!");
5337   if (MaxVectorSize == 0) {
5338     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5339     MaxVectorSize = 1;
5340     return MaxVectorSize;
5341   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5342              isPowerOf2_32(ConstTripCount)) {
5343     // We need to clamp the VF to be the ConstTripCount. There is no point in
5344     // choosing a higher viable VF as done in the loop below.
5345     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5346                       << ConstTripCount << "\n");
5347     MaxVectorSize = ConstTripCount;
5348     return MaxVectorSize;
5349   }
5350 
5351   unsigned MaxVF = MaxVectorSize;
5352   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5353       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5354     // Collect all viable vectorization factors larger than the default MaxVF
5355     // (i.e. MaxVectorSize).
5356     SmallVector<ElementCount, 8> VFs;
5357     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5358     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5359       VFs.push_back(ElementCount::getFixed(VS));
5360 
5361     // For each VF calculate its register usage.
5362     auto RUs = calculateRegisterUsage(VFs);
5363 
5364     // Select the largest VF which doesn't require more registers than existing
5365     // ones.
5366     for (int i = RUs.size() - 1; i >= 0; --i) {
5367       bool Selected = true;
5368       for (auto& pair : RUs[i].MaxLocalUsers) {
5369         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5370         if (pair.second > TargetNumRegisters)
5371           Selected = false;
5372       }
5373       if (Selected) {
5374         MaxVF = VFs[i].getKnownMinValue();
5375         break;
5376       }
5377     }
5378     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5379       if (MaxVF < MinVF) {
5380         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5381                           << ") with target's minimum: " << MinVF << '\n');
5382         MaxVF = MinVF;
5383       }
5384     }
5385   }
5386   return MaxVF;
5387 }
5388 
5389 VectorizationFactor
5390 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5391   float Cost = expectedCost(ElementCount::getFixed(1)).first;
5392   const float ScalarCost = Cost;
5393   unsigned Width = 1;
5394   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5395 
5396   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5397   if (ForceVectorization && MaxVF > 1) {
5398     // Ignore scalar width, because the user explicitly wants vectorization.
5399     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5400     // evaluation.
5401     Cost = std::numeric_limits<float>::max();
5402   }
5403 
5404   for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
5408     VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5409     float VectorCost = C.first / (float)i;
5410     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5411                       << " costs: " << (int)VectorCost << ".\n");
5412     if (!C.second && !ForceVectorization) {
5413       LLVM_DEBUG(
5414           dbgs() << "LV: Not considering vector loop of width " << i
5415                  << " because it will not generate any vector instructions.\n");
5416       continue;
5417     }
5418     if (VectorCost < Cost) {
5419       Cost = VectorCost;
5420       Width = i;
5421     }
5422   }
5423 
5424   if (!EnableCondStoresVectorization && NumPredStores) {
5425     reportVectorizationFailure("There are conditional stores.",
5426         "store that is conditionally executed prevents vectorization",
5427         "ConditionalStore", ORE, TheLoop);
5428     Width = 1;
5429     Cost = ScalarCost;
5430   }
5431 
5432   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5433              << "LV: Vectorization seems to be not beneficial, "
5434              << "but was forced by a user.\n");
5435   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5436   VectorizationFactor Factor = {ElementCount::getFixed(Width),
5437                                 (unsigned)(Width * Cost)};
5438   return Factor;
5439 }
5440 
5441 std::pair<unsigned, unsigned>
5442 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5443   unsigned MinWidth = -1U;
5444   unsigned MaxWidth = 8;
5445   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5446 
5447   // For each block.
5448   for (BasicBlock *BB : TheLoop->blocks()) {
5449     // For each instruction in the loop.
5450     for (Instruction &I : BB->instructionsWithoutDebug()) {
5451       Type *T = I.getType();
5452 
5453       // Skip ignored values.
5454       if (ValuesToIgnore.count(&I))
5455         continue;
5456 
5457       // Only examine Loads, Stores and PHINodes.
5458       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5459         continue;
5460 
5461       // Examine PHI nodes that are reduction variables. Update the type to
5462       // account for the recurrence type.
5463       if (auto *PN = dyn_cast<PHINode>(&I)) {
5464         if (!Legal->isReductionVariable(PN))
5465           continue;
5466         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5467         T = RdxDesc.getRecurrenceType();
5468       }
5469 
5470       // Examine the stored values.
5471       if (auto *ST = dyn_cast<StoreInst>(&I))
5472         T = ST->getValueOperand()->getType();
5473 
5474       // Ignore loaded pointer types and stored pointer types that are not
5475       // vectorizable.
5476       //
5477       // FIXME: The check here attempts to predict whether a load or store will
5478       //        be vectorized. We only know this for certain after a VF has
5479       //        been selected. Here, we assume that if an access can be
5480       //        vectorized, it will be. We should also look at extending this
5481       //        optimization to non-pointer types.
5482       //
5483       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5484           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5485         continue;
5486 
5487       MinWidth = std::min(MinWidth,
5488                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5489       MaxWidth = std::max(MaxWidth,
5490                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5491     }
5492   }
5493 
5494   return {MinWidth, MaxWidth};
5495 }
5496 
5497 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5498                                                            unsigned LoopCost) {
5499   // -- The interleave heuristics --
5500   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5501   // There are many micro-architectural considerations that we can't predict
5502   // at this level. For example, frontend pressure (on decode or fetch) due to
5503   // code size, or the number and capabilities of the execution ports.
5504   //
5505   // We use the following heuristics to select the interleave count:
5506   // 1. If the code has reductions, then we interleave to break the cross
5507   // iteration dependency.
5508   // 2. If the loop is really small, then we interleave to reduce the loop
5509   // overhead.
5510   // 3. We don't interleave if we think that we will spill registers to memory
5511   // due to the increased register pressure.
5512 
5513   if (!isScalarEpilogueAllowed())
5514     return 1;
5515 
  // The maximum safe dependence distance was already used to cap the
  // vectorization factor, so do not interleave beyond it.
5517   if (Legal->getMaxSafeDepDistBytes() != -1U)
5518     return 1;
5519 
5520   // Do not interleave loops with a relatively small known or estimated trip
5521   // count.
5522   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5523   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5524     return 1;
5525 
5526   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these values below, so make sure each register class is
  // counted as using at least one register to avoid dividing by zero.
5529   for (auto& pair : R.MaxLocalUsers) {
5530     pair.second = std::max(pair.second, 1U);
5531   }
5532 
5533   // We calculate the interleave count using the following formula.
5534   // Subtract the number of loop invariants from the number of available
5535   // registers. These registers are used by all of the interleaved instances.
5536   // Next, divide the remaining registers by the number of registers that is
5537   // required by the loop, in order to estimate how many parallel instances
5538   // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when OptForSize, in which case an
  // interleave count of 1 is returned above.
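  // As a worked example (hypothetical register counts): with 16 registers in
  // a class, 2 used by loop invariants, and a maximum local usage of 3, the
  // candidate count is PowerOf2Floor((16 - 2) / 3) = 4.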
5544   unsigned IC = UINT_MAX;
5545 
5546   for (auto& pair : R.MaxLocalUsers) {
5547     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5548     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5549                       << " registers of "
5550                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5551     if (VF == 1) {
5552       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5553         TargetNumRegisters = ForceTargetNumScalarRegs;
5554     } else {
5555       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5556         TargetNumRegisters = ForceTargetNumVectorRegs;
5557     }
5558     unsigned MaxLocalUsers = pair.second;
5559     unsigned LoopInvariantRegs = 0;
5560     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5561       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5562 
5563     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5564     // Don't count the induction variable as interleaved.
5565     if (EnableIndVarRegisterHeur) {
5566       TmpIC =
5567           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5568                         std::max(1U, (MaxLocalUsers - 1)));
5569     }
5570 
5571     IC = std::min(IC, TmpIC);
5572   }
5573 
5574   // Clamp the interleave ranges to reasonable counts.
5575   assert(!VF.isScalable() && "scalable vectors not yet supported.");
5576   unsigned MaxInterleaveCount =
5577       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5578 
5579   // Check if the user has overridden the max.
5580   if (VF == 1) {
5581     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5582       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5583   } else {
5584     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5585       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5586   }
5587 
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count to be less than the trip count divided by VF.
5590   if (BestKnownTC) {
5591     MaxInterleaveCount =
5592         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5593   }
5594 
5595   // If we did not calculate the cost for VF (because the user selected the VF)
5596   // then we calculate the cost of VF here.
5597   if (LoopCost == 0)
5598     LoopCost = expectedCost(VF).first;
5599 
5600   assert(LoopCost && "Non-zero loop cost expected");
5601 
  // Clamp the calculated IC to be between 1 and the maximum interleave count
  // that the target and trip count allow.
5604   if (IC > MaxInterleaveCount)
5605     IC = MaxInterleaveCount;
5606   else if (IC < 1)
5607     IC = 1;
5608 
5609   // Interleave if we vectorized this loop and there is a reduction that could
5610   // benefit from interleaving.
5611   if (VF.isVector() && !Legal->getReductionVars().empty()) {
5612     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5613     return IC;
5614   }
5615 
5616   // Note that if we've already vectorized the loop we will have done the
5617   // runtime check and so interleaving won't require further checks.
5618   bool InterleavingRequiresRuntimePointerCheck =
5619       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5620 
5621   // We want to interleave small loops in order to reduce the loop overhead and
5622   // potentially expose ILP opportunities.
5623   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5624   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5625     // We assume that the cost overhead is 1 and we use the cost model
5626     // to estimate the cost of the loop and interleave until the cost of the
5627     // loop overhead is about 5% of the cost of the loop.
5628     unsigned SmallIC =
5629         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
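    // For example (hypothetical costs): with SmallLoopCost = 20 and
    // LoopCost = 6, SmallIC = min(IC, PowerOf2Floor(20 / 6)) = min(IC, 2).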
5630 
5631     // Interleave until store/load ports (estimated by max interleave count) are
5632     // saturated.
5633     unsigned NumStores = Legal->getNumStores();
5634     unsigned NumLoads = Legal->getNumLoads();
5635     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5636     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5637 
5638     // If we have a scalar reduction (vector reductions are already dealt with
5639     // by this point), we can increase the critical path length if the loop
5640     // we're interleaving is inside another loop. Limit, by default to 2, so the
5641     // critical path only gets increased by one reduction operation.
5642     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5643       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5644       SmallIC = std::min(SmallIC, F);
5645       StoresIC = std::min(StoresIC, F);
5646       LoadsIC = std::min(LoadsIC, F);
5647     }
5648 
5649     if (EnableLoadStoreRuntimeInterleave &&
5650         std::max(StoresIC, LoadsIC) > SmallIC) {
5651       LLVM_DEBUG(
5652           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5653       return std::max(StoresIC, LoadsIC);
5654     }
5655 
5656     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5657     return SmallIC;
5658   }
5659 
5660   // Interleave if this is a large loop (small loops are already dealt with by
5661   // this point) that could benefit from interleaving.
5662   bool HasReductions = !Legal->getReductionVars().empty();
5663   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5664     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5665     return IC;
5666   }
5667 
5668   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5669   return 1;
5670 }
5671 
5672 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5673 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5674   // This function calculates the register usage by measuring the highest number
5675   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
5677   // assign a number to each instruction. We use RPO to ensure that defs are
5678   // met before their users. We assume that each instruction that has in-loop
5679   // users starts an interval. We record every time that an in-loop value is
5680   // used, so we have a list of the first and last occurrences of each
5681   // instruction. Next, we transpose this data structure into a multi map that
5682   // holds the list of intervals that *end* at a specific location. This multi
5683   // map allows us to perform a linear search. We scan the instructions linearly
5684   // and record each time that a new interval starts, by placing it in a set.
5685   // If we find this value in the multi-map then we remove it from the set.
5686   // The max register usage is the maximum size of the set.
5687   // We also search for instructions that are defined outside the loop, but are
5688   // used inside the loop. We need this number separately from the max-interval
5689   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
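  // As a small illustration (hypothetical straight-line code):
  //   %a = ...  %b = ...  %c = add %a, %b  %d = mul %c, %a
  // %a and %b are live when %c is defined, and %a and %c are live when %d is
  // defined, so at most two loop-defined values are open at once; any
  // loop-invariant operands are counted separately.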
5691   LoopBlocksDFS DFS(TheLoop);
5692   DFS.perform(LI);
5693 
5694   RegisterUsage RU;
5695 
5696   // Each 'key' in the map opens a new interval. The values
5697   // of the map are the index of the 'last seen' usage of the
5698   // instruction that is the key.
5699   using IntervalMap = DenseMap<Instruction *, unsigned>;
5700 
5701   // Maps instruction to its index.
5702   SmallVector<Instruction *, 64> IdxToInstr;
5703   // Marks the end of each interval.
5704   IntervalMap EndPoint;
5705   // Saves the list of instruction indices that are used in the loop.
5706   SmallPtrSet<Instruction *, 8> Ends;
5707   // Saves the list of values that are used in the loop but are
5708   // defined outside the loop, such as arguments and constants.
5709   SmallPtrSet<Value *, 8> LoopInvariants;
5710 
5711   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5712     for (Instruction &I : BB->instructionsWithoutDebug()) {
5713       IdxToInstr.push_back(&I);
5714 
5715       // Save the end location of each USE.
5716       for (Value *U : I.operands()) {
5717         auto *Instr = dyn_cast<Instruction>(U);
5718 
5719         // Ignore non-instruction values such as arguments, constants, etc.
5720         if (!Instr)
5721           continue;
5722 
5723         // If this instruction is outside the loop then record it and continue.
5724         if (!TheLoop->contains(Instr)) {
5725           LoopInvariants.insert(Instr);
5726           continue;
5727         }
5728 
5729         // Overwrite previous end points.
5730         EndPoint[Instr] = IdxToInstr.size();
5731         Ends.insert(Instr);
5732       }
5733     }
5734   }
5735 
5736   // Saves the list of intervals that end with the index in 'key'.
5737   using InstrList = SmallVector<Instruction *, 2>;
5738   DenseMap<unsigned, InstrList> TransposeEnds;
5739 
5740   // Transpose the EndPoints to a list of values that end at each index.
5741   for (auto &Interval : EndPoint)
5742     TransposeEnds[Interval.second].push_back(Interval.first);
5743 
5744   SmallPtrSet<Instruction *, 8> OpenIntervals;
5745 
5746   // Get the size of the widest register.
5747   unsigned MaxSafeDepDist = -1U;
5748   if (Legal->getMaxSafeDepDistBytes() != -1U)
5749     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5750   unsigned WidestRegister =
5751       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5752   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5753 
5754   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5755   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5756 
5757   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5758 
5759   // A lambda that gets the register usage for the given type and VF.
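  // For instance (assuming a 128-bit widest register): an i32 value at VF = 4
  // occupies max(1, 4 * 32 / 128) = 1 register, while at VF = 8 it occupies 2.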
5760   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) {
5761     if (Ty->isTokenTy())
5762       return 0U;
5763     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5764     assert(!VF.isScalable() && "scalable vectors not yet supported.");
5765     return std::max<unsigned>(1, VF.getKnownMinValue() * TypeSize /
5766                                      WidestRegister);
5767   };
5768 
5769   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5770     Instruction *I = IdxToInstr[i];
5771 
5772     // Remove all of the instructions that end at this location.
5773     InstrList &List = TransposeEnds[i];
5774     for (Instruction *ToRemove : List)
5775       OpenIntervals.erase(ToRemove);
5776 
5777     // Ignore instructions that are never used within the loop.
5778     if (!Ends.count(I))
5779       continue;
5780 
5781     // Skip ignored values.
5782     if (ValuesToIgnore.count(I))
5783       continue;
5784 
5785     // For each VF find the maximum usage of registers.
5786     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5787       // Count the number of live intervals.
5788       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5789 
5790       if (VFs[j].isScalar()) {
5791         for (auto Inst : OpenIntervals) {
5792           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5793           if (RegUsage.find(ClassID) == RegUsage.end())
5794             RegUsage[ClassID] = 1;
5795           else
5796             RegUsage[ClassID] += 1;
5797         }
5798       } else {
5799         collectUniformsAndScalars(VFs[j]);
5800         for (auto Inst : OpenIntervals) {
5801           // Skip ignored values for VF > 1.
5802           if (VecValuesToIgnore.count(Inst))
5803             continue;
5804           if (isScalarAfterVectorization(Inst, VFs[j])) {
5805             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5806             if (RegUsage.find(ClassID) == RegUsage.end())
5807               RegUsage[ClassID] = 1;
5808             else
5809               RegUsage[ClassID] += 1;
5810           } else {
5811             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5812             if (RegUsage.find(ClassID) == RegUsage.end())
5813               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5814             else
5815               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5816           }
5817         }
5818       }
5819 
5820       for (auto& pair : RegUsage) {
5821         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5822           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5823         else
5824           MaxUsages[j][pair.first] = pair.second;
5825       }
5826     }
5827 
5828     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5829                       << OpenIntervals.size() << '\n');
5830 
5831     // Add the current instruction to the list of open intervals.
5832     OpenIntervals.insert(I);
5833   }
5834 
5835   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5836     SmallMapVector<unsigned, unsigned, 4> Invariant;
5837 
5838     for (auto Inst : LoopInvariants) {
5839       unsigned Usage =
5840           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5841       unsigned ClassID =
5842           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
5843       if (Invariant.find(ClassID) == Invariant.end())
5844         Invariant[ClassID] = Usage;
5845       else
5846         Invariant[ClassID] += Usage;
5847     }
5848 
5849     LLVM_DEBUG({
5850       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5851       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5852              << " item\n";
5853       for (const auto &pair : MaxUsages[i]) {
5854         dbgs() << "LV(REG): RegisterClass: "
5855                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5856                << " registers\n";
5857       }
5858       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5859              << " item\n";
5860       for (const auto &pair : Invariant) {
5861         dbgs() << "LV(REG): RegisterClass: "
5862                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5863                << " registers\n";
5864       }
5865     });
5866 
5867     RU.LoopInvariantRegs = Invariant;
5868     RU.MaxLocalUsers = MaxUsages[i];
5869     RUs[i] = RU;
5870   }
5871 
5872   return RUs;
5873 }
5874 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5876   // TODO: Cost model for emulated masked load/store is completely
5877   // broken. This hack guides the cost model to use an artificially
5878   // high enough value to practically disable vectorization with such
5879   // operations, except where previously deployed legality hack allowed
5880   // using very low cost values. This is to avoid regressions coming simply
5881   // from moving "masked load/store" check from legality to cost model.
5882   // Masked Load/Gather emulation was previously never allowed.
  // Only a limited amount of Masked Store/Scatter emulation was allowed.
5884   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5885   return isa<LoadInst>(I) ||
5886          (isa<StoreInst>(I) &&
5887           NumPredStores > NumberOfStoresToPredicate);
5888 }
5889 
5890 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5891   // If we aren't vectorizing the loop, or if we've already collected the
5892   // instructions to scalarize, there's nothing to do. Collection may already
5893   // have occurred if we have a user-selected VF and are now computing the
5894   // expected cost for interleaving.
5895   if (VF.isScalar() || VF.isZero() ||
5896       InstsToScalarize.find(VF) != InstsToScalarize.end())
5897     return;
5898 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5900   // not profitable to scalarize any instructions, the presence of VF in the
5901   // map will indicate that we've analyzed it already.
5902   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5903 
5904   // Find all the instructions that are scalar with predication in the loop and
5905   // determine if it would be better to not if-convert the blocks they are in.
5906   // If so, we also record the instructions to scalarize.
5907   for (BasicBlock *BB : TheLoop->blocks()) {
5908     if (!blockNeedsPredication(BB))
5909       continue;
5910     for (Instruction &I : *BB)
5911       if (isScalarWithPredication(&I)) {
5912         ScalarCostsTy ScalarCosts;
5913         // Do not apply discount logic if hacked cost is needed
5914         // for emulated masked memrefs.
5915         if (!useEmulatedMaskMemRefHack(&I) &&
5916             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5917           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5918         // Remember that BB will remain after vectorization.
5919         PredicatedBBsAfterVectorization.insert(BB);
5920       }
5921   }
5922 }
5923 
5924 int LoopVectorizationCostModel::computePredInstDiscount(
5925     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5926     ElementCount VF) {
5927   assert(!isUniformAfterVectorization(PredInst, VF) &&
5928          "Instruction marked uniform-after-vectorization will be predicated");
5929 
5930   // Initialize the discount to zero, meaning that the scalar version and the
5931   // vector version cost the same.
5932   int Discount = 0;
5933 
5934   // Holds instructions to analyze. The instructions we visit are mapped in
5935   // ScalarCosts. Those instructions are the ones that would be scalarized if
5936   // we find that the scalar version costs less.
5937   SmallVector<Instruction *, 8> Worklist;
5938 
5939   // Returns true if the given instruction can be scalarized.
5940   auto canBeScalarized = [&](Instruction *I) -> bool {
5941     // We only attempt to scalarize instructions forming a single-use chain
5942     // from the original predicated block that would otherwise be vectorized.
5943     // Although not strictly necessary, we give up on instructions we know will
5944     // already be scalar to avoid traversing chains that are unlikely to be
5945     // beneficial.
5946     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5947         isScalarAfterVectorization(I, VF))
5948       return false;
5949 
5950     // If the instruction is scalar with predication, it will be analyzed
5951     // separately. We ignore it within the context of PredInst.
5952     if (isScalarWithPredication(I))
5953       return false;
5954 
5955     // If any of the instruction's operands are uniform after vectorization,
5956     // the instruction cannot be scalarized. This prevents, for example, a
5957     // masked load from being scalarized.
5958     //
5959     // We assume we will only emit a value for lane zero of an instruction
5960     // marked uniform after vectorization, rather than VF identical values.
5961     // Thus, if we scalarize an instruction that uses a uniform, we would
5962     // create uses of values corresponding to the lanes we aren't emitting code
5963     // for. This behavior can be changed by allowing getScalarValue to clone
5964     // the lane zero values for uniforms rather than asserting.
5965     for (Use &U : I->operands())
5966       if (auto *J = dyn_cast<Instruction>(U.get()))
5967         if (isUniformAfterVectorization(J, VF))
5968           return false;
5969 
5970     // Otherwise, we can scalarize the instruction.
5971     return true;
5972   };
5973 
5974   // Compute the expected cost discount from scalarizing the entire expression
5975   // feeding the predicated instruction. We currently only consider expressions
5976   // that are single-use instruction chains.
5977   Worklist.push_back(PredInst);
5978   while (!Worklist.empty()) {
5979     Instruction *I = Worklist.pop_back_val();
5980 
5981     // If we've already analyzed the instruction, there's nothing to do.
5982     if (ScalarCosts.find(I) != ScalarCosts.end())
5983       continue;
5984 
5985     // Compute the cost of the vector instruction. Note that this cost already
5986     // includes the scalarization overhead of the predicated instruction.
5987     unsigned VectorCost = getInstructionCost(I, VF).first;
5988 
5989     // Compute the cost of the scalarized instruction. This cost is the cost of
5990     // the instruction as if it wasn't if-converted and instead remained in the
5991     // predicated block. We will scale this cost by block probability after
5992     // computing the scalarization overhead.
5993     assert(!VF.isScalable() && "scalable vectors not yet supported.");
5994     unsigned ScalarCost =
5995         VF.getKnownMinValue() *
5996         getInstructionCost(I, ElementCount::getFixed(1)).first;
5997 
5998     // Compute the scalarization overhead of needed insertelement instructions
5999     // and phi nodes.
6000     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6001       ScalarCost += TTI.getScalarizationOverhead(
6002           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6003           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6004       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6005       ScalarCost +=
6006           VF.getKnownMinValue() *
6007           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6008     }
6009 
6010     // Compute the scalarization overhead of needed extractelement
6011     // instructions. For each of the instruction's operands, if the operand can
6012     // be scalarized, add it to the worklist; otherwise, account for the
6013     // overhead.
6014     for (Use &U : I->operands())
6015       if (auto *J = dyn_cast<Instruction>(U.get())) {
6016         assert(VectorType::isValidElementType(J->getType()) &&
6017                "Instruction has non-scalar type");
6018         if (canBeScalarized(J))
6019           Worklist.push_back(J);
6020         else if (needsExtract(J, VF)) {
6021           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6022           ScalarCost += TTI.getScalarizationOverhead(
6023               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6024               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6025         }
6026       }
6027 
6028     // Scale the total scalar cost by block probability.
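    // For example, with a reciprocal block probability of 2 (a 50% chance of
    // executing the predicated block), a scalar cost of 8 is scaled down to 4.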
6029     ScalarCost /= getReciprocalPredBlockProb();
6030 
6031     // Compute the discount. A non-negative discount means the vector version
6032     // of the instruction costs more, and scalarizing would be beneficial.
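    // For example, VectorCost = 10 and ScalarCost = 6 contribute a discount
    // of 4 in favor of the scalar version.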
6033     Discount += VectorCost - ScalarCost;
6034     ScalarCosts[I] = ScalarCost;
6035   }
6036 
6037   return Discount;
6038 }
6039 
6040 LoopVectorizationCostModel::VectorizationCostTy
6041 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6042   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6043   VectorizationCostTy Cost;
6044 
6045   // For each block.
6046   for (BasicBlock *BB : TheLoop->blocks()) {
6047     VectorizationCostTy BlockCost;
6048 
6049     // For each instruction in the old loop.
6050     for (Instruction &I : BB->instructionsWithoutDebug()) {
6051       // Skip ignored values.
6052       if (ValuesToIgnore.count(&I) ||
6053           (VF.isVector() && VecValuesToIgnore.count(&I)))
6054         continue;
6055 
6056       VectorizationCostTy C = getInstructionCost(&I, VF);
6057 
6058       // Check if we should override the cost.
6059       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6060         C.first = ForceTargetInstructionCost;
6061 
6062       BlockCost.first += C.first;
6063       BlockCost.second |= C.second;
6064       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6065                         << " for VF " << VF << " For instruction: " << I
6066                         << '\n');
6067     }
6068 
6069     // If we are vectorizing a predicated block, it will have been
6070     // if-converted. This means that the block's instructions (aside from
6071     // stores and instructions that may divide by zero) will now be
6072     // unconditionally executed. For the scalar case, we may not always execute
6073     // the predicated block. Thus, scale the block's cost by the probability of
6074     // executing it.
6075     if (VF.isScalar() && blockNeedsPredication(BB))
6076       BlockCost.first /= getReciprocalPredBlockProb();
6077 
6078     Cost.first += BlockCost.first;
6079     Cost.second |= BlockCost.second;
6080   }
6081 
6082   return Cost;
6083 }
6084 
6085 /// Gets the address access SCEV after verifying that the access pattern is
6086 /// loop invariant except for the induction variable dependence.
6087 ///
6088 /// This SCEV can be sent to the Target in order to estimate the address
6089 /// calculation cost.
6090 static const SCEV *getAddressAccessSCEV(
6091               Value *Ptr,
6092               LoopVectorizationLegality *Legal,
6093               PredicatedScalarEvolution &PSE,
6094               const Loop *TheLoop) {
6095 
6096   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6097   if (!Gep)
6098     return nullptr;
6099 
6100   // We are looking for a gep with all loop invariant indices except for one
6101   // which should be an induction variable.
6102   auto SE = PSE.getSE();
6103   unsigned NumOperands = Gep->getNumOperands();
6104   for (unsigned i = 1; i < NumOperands; ++i) {
6105     Value *Opd = Gep->getOperand(i);
6106     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6107         !Legal->isInductionVariable(Opd))
6108       return nullptr;
6109   }
6110 
6111   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6112   return PSE.getSCEV(Ptr);
6113 }
6114 
6115 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6116   return Legal->hasStride(I->getOperand(0)) ||
6117          Legal->hasStride(I->getOperand(1));
6118 }
6119 
6120 unsigned
6121 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6122                                                         ElementCount VF) {
6123   assert(VF.isVector() &&
6124          "Scalarization cost of instruction implies vectorization.");
6125   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6126   Type *ValTy = getMemInstValueType(I);
6127   auto SE = PSE.getSE();
6128 
6129   unsigned AS = getLoadStoreAddressSpace(I);
6130   Value *Ptr = getLoadStorePointerOperand(I);
6131   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6132 
6133   // Figure out whether the access is strided and get the stride value
6134   // if it's known at compile time.
6135   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6136 
6137   // Get the cost of the scalar memory instruction and address computation.
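  // The instruction is replicated once per lane, so e.g. VF = 4 pays for four
  // address computations and four scalar memory operations.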
6138   unsigned Cost =
6139       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6140 
6141   // Don't pass *I here, since it is scalar but will actually be part of a
6142   // vectorized loop where the user of it is a vectorized instruction.
6143   const Align Alignment = getLoadStoreAlignment(I);
6144   Cost += VF.getKnownMinValue() *
6145           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6146                               AS, TTI::TCK_RecipThroughput);
6147 
6148   // Get the overhead of the extractelement and insertelement instructions
6149   // we might create due to scalarization.
6150   Cost += getScalarizationOverhead(I, VF);
6151 
6152   // If we have a predicated store, it may not be executed for each vector
6153   // lane. Scale the cost by the probability of executing the predicated
6154   // block.
6155   if (isPredicatedInst(I)) {
6156     Cost /= getReciprocalPredBlockProb();
6157 
6158     if (useEmulatedMaskMemRefHack(I))
6159       // Artificially set the cost to a value high enough to practically
6160       // disable vectorization with such operations.
6161       Cost = 3000000;
6162   }
6163 
6164   return Cost;
6165 }
6166 
6167 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6168                                                              ElementCount VF) {
6169   Type *ValTy = getMemInstValueType(I);
6170   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6171   Value *Ptr = getLoadStorePointerOperand(I);
6172   unsigned AS = getLoadStoreAddressSpace(I);
6173   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6174   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6175 
6176   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6177          "Stride should be 1 or -1 for consecutive memory access");
6178   const Align Alignment = getLoadStoreAlignment(I);
6179   unsigned Cost = 0;
6180   if (Legal->isMaskRequired(I))
6181     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6182                                       CostKind);
6183   else
6184     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6185                                 CostKind, I);
6186 
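  // A negative consecutive stride accesses memory in reverse order, so each
  // widened memory operation additionally needs a reverse shuffle.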
6187   bool Reverse = ConsecutiveStride < 0;
6188   if (Reverse)
6189     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6190   return Cost;
6191 }
6192 
6193 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6194                                                          ElementCount VF) {
6195   Type *ValTy = getMemInstValueType(I);
6196   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6197   const Align Alignment = getLoadStoreAlignment(I);
6198   unsigned AS = getLoadStoreAddressSpace(I);
6199   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6200   if (isa<LoadInst>(I)) {
6201     return TTI.getAddressComputationCost(ValTy) +
6202            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6203                                CostKind) +
6204            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6205   }
6206   StoreInst *SI = cast<StoreInst>(I);
6207 
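  // When the stored value varies per lane, only the value of the last lane
  // needs to reach the uniform address, hence a single extract from lane
  // VF - 1 below.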
6208   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6209   return TTI.getAddressComputationCost(ValTy) +
6210          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6211                              CostKind) +
6212          (isLoopInvariantStoreValue
6213               ? 0
6214               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6215                                        VF.getKnownMinValue() - 1));
6216 }
6217 
6218 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6219                                                           ElementCount VF) {
6220   Type *ValTy = getMemInstValueType(I);
6221   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6222   const Align Alignment = getLoadStoreAlignment(I);
6223   const Value *Ptr = getLoadStorePointerOperand(I);
6224 
6225   return TTI.getAddressComputationCost(VectorTy) +
6226          TTI.getGatherScatterOpCost(
6227              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6228              TargetTransformInfo::TCK_RecipThroughput, I);
6229 }
6230 
6231 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6232                                                             ElementCount VF) {
6233   Type *ValTy = getMemInstValueType(I);
6234   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6235   unsigned AS = getLoadStoreAddressSpace(I);
6236 
6237   auto Group = getInterleavedAccessGroup(I);
6238   assert(Group && "Failed to get an interleaved access group.");
6239 
6240   unsigned InterleaveFactor = Group->getFactor();
6241   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6242   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6243 
6244   // Holds the indices of existing members in an interleaved load group.
6245   // An interleaved store group doesn't need this as it doesn't allow gaps.
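  // For example, a load group with factor 3 that is missing the member at
  // index 1 yields Indices = {0, 2}.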
6246   SmallVector<unsigned, 4> Indices;
6247   if (isa<LoadInst>(I)) {
6248     for (unsigned i = 0; i < InterleaveFactor; i++)
6249       if (Group->getMember(i))
6250         Indices.push_back(i);
6251   }
6252 
6253   // Calculate the cost of the whole interleaved group.
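  // Gaps in the group must be masked off when the scalar epilogue that would
  // otherwise handle them is not allowed.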
6254   bool UseMaskForGaps =
6255       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6256   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6257       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6258       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6259 
6260   if (Group->isReverse()) {
6261     // TODO: Add support for reversed masked interleaved access.
6262     assert(!Legal->isMaskRequired(I) &&
6263            "Reverse masked interleaved access not supported.");
6264     Cost += Group->getNumMembers() *
6265             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6266   }
6267   return Cost;
6268 }
6269 
6270 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6271                                                               ElementCount VF) {
6272   // Calculate scalar cost only. Vectorization cost should be ready at this
6273   // moment.
6274   if (VF.isScalar()) {
6275     Type *ValTy = getMemInstValueType(I);
6276     const Align Alignment = getLoadStoreAlignment(I);
6277     unsigned AS = getLoadStoreAddressSpace(I);
6278 
6279     return TTI.getAddressComputationCost(ValTy) +
6280            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6281                                TTI::TCK_RecipThroughput, I);
6282   }
6283   return getWideningCost(I, VF);
6284 }
6285 
6286 LoopVectorizationCostModel::VectorizationCostTy
6287 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6288                                                ElementCount VF) {
6289   assert(!VF.isScalable() &&
6290          "the cost model is not yet implemented for scalable vectorization");
6291   // If we know that this instruction will remain uniform, check the cost of
6292   // the scalar version.
6293   if (isUniformAfterVectorization(I, VF))
6294     VF = ElementCount::getFixed(1);
6295 
6296   if (VF.isVector() && isProfitableToScalarize(I, VF))
6297     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6298 
6299   // Forced scalars do not have any scalarization overhead.
6300   auto ForcedScalar = ForcedScalars.find(VF);
6301   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6302     auto InstSet = ForcedScalar->second;
6303     if (InstSet.count(I))
6304       return VectorizationCostTy(
6305           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6306            VF.getKnownMinValue()),
6307           false);
6308   }
6309 
6310   Type *VectorTy;
6311   unsigned C = getInstructionCost(I, VF, VectorTy);
6312 
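  // The type counts as "not scalarized" when the target legalizes it into
  // fewer parts than there are lanes, i.e. it remains a genuine vector type.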
6313   bool TypeNotScalarized =
6314       VF.isVector() && VectorTy->isVectorTy() &&
6315       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6316   return VectorizationCostTy(C, TypeNotScalarized);
6317 }
6318 
6319 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6320                                                               ElementCount VF) {
6321 
6322   assert(!VF.isScalable() &&
6323          "cannot compute scalarization overhead for scalable vectorization");
6324   if (VF.isScalar())
6325     return 0;
6326 
6327   unsigned Cost = 0;
6328   Type *RetTy = ToVectorTy(I->getType(), VF);
6329   if (!RetTy->isVoidTy() &&
6330       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6331     Cost += TTI.getScalarizationOverhead(
6332         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6333         true, false);
6334 
6335   // Some targets keep addresses scalar.
6336   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6337     return Cost;
6338 
6339   // Some targets support efficient element stores.
6340   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6341     return Cost;
6342 
6343   // Collect operands to consider.
6344   CallInst *CI = dyn_cast<CallInst>(I);
6345   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6346 
6347   // Skip operands that do not require extraction/scalarization and do not incur
6348   // any overhead.
6349   return Cost + TTI.getOperandsScalarizationOverhead(
6350                     filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6351 }
6352 
6353 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6354   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6355   if (VF.isScalar())
6356     return;
6357   NumPredStores = 0;
6358   for (BasicBlock *BB : TheLoop->blocks()) {
6359     // For each instruction in the old loop.
6360     for (Instruction &I : *BB) {
6361       Value *Ptr = getLoadStorePointerOperand(&I);
6362       if (!Ptr)
6363         continue;
6364 
6365       // TODO: We should generate better code and update the cost model for
6366       // predicated uniform stores. Today they are treated as any other
6367       // predicated store (see added test cases in
6368       // invariant-store-vectorization.ll).
6369       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6370         NumPredStores++;
6371 
6372       if (Legal->isUniform(Ptr) &&
6373           // Conditional loads and stores should be scalarized and predicated.
6374           // isScalarWithPredication cannot be used here since masked
6375           // gather/scatters are not considered scalar with predication.
6376           !Legal->blockNeedsPredication(I.getParent())) {
6377         // TODO: Avoid replicating loads and stores instead of
6378         // relying on instcombine to remove them.
6379         // Load: Scalar load + broadcast
6380         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6381         unsigned Cost = getUniformMemOpCost(&I, VF);
6382         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6383         continue;
6384       }
6385 
6386       // We assume that widening is the best solution when possible.
6387       if (memoryInstructionCanBeWidened(&I, VF)) {
6388         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6389         int ConsecutiveStride =
6390                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6391         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6392                "Expected consecutive stride.");
6393         InstWidening Decision =
6394             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6395         setWideningDecision(&I, VF, Decision, Cost);
6396         continue;
6397       }
6398 
6399       // Choose between Interleaving, Gather/Scatter or Scalarization.
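      // InterleaveCost starts at "infinity" so that a non-interleaved access
      // can never appear to be a profitable interleave group below.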
6400       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6401       unsigned NumAccesses = 1;
6402       if (isAccessInterleaved(&I)) {
6403         auto Group = getInterleavedAccessGroup(&I);
6404         assert(Group && "Failed to get an interleaved access group.");
6405 
6406         // Make one decision for the whole group.
6407         if (getWideningDecision(&I, VF) != CM_Unknown)
6408           continue;
6409 
6410         NumAccesses = Group->getNumMembers();
6411         if (interleavedAccessCanBeWidened(&I, VF))
6412           InterleaveCost = getInterleaveGroupCost(&I, VF);
6413       }
6414 
6415       unsigned GatherScatterCost =
6416           isLegalGatherOrScatter(&I)
6417               ? getGatherScatterCost(&I, VF) * NumAccesses
6418               : std::numeric_limits<unsigned>::max();
6419 
6420       unsigned ScalarizationCost =
6421           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6422 
6423       // Choose better solution for the current VF,
6424       // write down this decision and use it during vectorization.
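      // Note the asymmetric comparisons: interleaving wins ties against
      // gather/scatter but must strictly beat scalarization.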
6425       unsigned Cost;
6426       InstWidening Decision;
6427       if (InterleaveCost <= GatherScatterCost &&
6428           InterleaveCost < ScalarizationCost) {
6429         Decision = CM_Interleave;
6430         Cost = InterleaveCost;
6431       } else if (GatherScatterCost < ScalarizationCost) {
6432         Decision = CM_GatherScatter;
6433         Cost = GatherScatterCost;
6434       } else {
6435         Decision = CM_Scalarize;
6436         Cost = ScalarizationCost;
6437       }
6438       // If the instruction belongs to an interleave group, the whole group
6439       // receives the same decision. The cost is recorded for the group, but
6440       // it will actually be assigned to a single instruction.
6441       if (auto Group = getInterleavedAccessGroup(&I))
6442         setWideningDecision(Group, VF, Decision, Cost);
6443       else
6444         setWideningDecision(&I, VF, Decision, Cost);
6445     }
6446   }
6447 
6448   // Make sure that any load of address and any other address computation
6449   // remains scalar unless there is gather/scatter support. This avoids
6450   // inevitable extracts into address registers, and also has the benefit of
6451   // activating LSR more, since that pass can't optimize vectorized
6452   // addresses.
6453   if (TTI.prefersVectorizedAddressing())
6454     return;
6455 
6456   // Start with all scalar pointer uses.
6457   SmallPtrSet<Instruction *, 8> AddrDefs;
6458   for (BasicBlock *BB : TheLoop->blocks())
6459     for (Instruction &I : *BB) {
6460       Instruction *PtrDef =
6461         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6462       if (PtrDef && TheLoop->contains(PtrDef) &&
6463           getWideningDecision(&I, VF) != CM_GatherScatter)
6464         AddrDefs.insert(PtrDef);
6465     }
6466 
6467   // Add all instructions used to generate the addresses.
6468   SmallVector<Instruction *, 4> Worklist;
6469   for (auto *I : AddrDefs)
6470     Worklist.push_back(I);
6471   while (!Worklist.empty()) {
6472     Instruction *I = Worklist.pop_back_val();
6473     for (auto &Op : I->operands())
6474       if (auto *InstOp = dyn_cast<Instruction>(Op))
6475         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6476             AddrDefs.insert(InstOp).second)
6477           Worklist.push_back(InstOp);
6478   }
6479 
6480   for (auto *I : AddrDefs) {
6481     if (isa<LoadInst>(I)) {
6482       // Setting the desired widening decision should ideally be handled by
6483       // the cost functions, but since that would require determining whether
6484       // the loaded register is involved in an address computation, the
6485       // decision is instead overridden here once we know this is the case.
6486       InstWidening Decision = getWideningDecision(I, VF);
6487       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6488         // Scalarize a widened load of address.
6489         setWideningDecision(
6490             I, VF, CM_Scalarize,
6491             (VF.getKnownMinValue() *
6492              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6493       else if (auto Group = getInterleavedAccessGroup(I)) {
6494         // Scalarize an interleave group of address loads.
6495         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6496           if (Instruction *Member = Group->getMember(I))
6497             setWideningDecision(
6498                 Member, VF, CM_Scalarize,
6499                 (VF.getKnownMinValue() *
6500                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6501         }
6502       }
6503     } else
6504       // Make sure I gets scalarized and a cost estimate without
6505       // scalarization overhead.
6506       ForcedScalars[VF].insert(I);
6507   }
6508 }
6509 
6510 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6511                                                         ElementCount VF,
6512                                                         Type *&VectorTy) {
6513   Type *RetTy = I->getType();
6514   if (canTruncateToMinimalBitwidth(I, VF))
6515     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6516   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6517   auto SE = PSE.getSE();
6518   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6519 
6520   // TODO: We need to estimate the cost of intrinsic calls.
6521   switch (I->getOpcode()) {
6522   case Instruction::GetElementPtr:
6523     // We mark this instruction as zero-cost because the cost of GEPs in
6524     // vectorized code depends on whether the corresponding memory instruction
6525     // is scalarized or not. Therefore, we handle GEPs with the memory
6526     // instruction cost.
6527     return 0;
6528   case Instruction::Br: {
6529     // In cases of scalarized and predicated instructions, there will be VF
6530     // predicated blocks in the vectorized loop. Each branch around these
6531     // blocks also requires an extract of its vector compare i1 element.
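    // The cost below therefore covers VF extracts from the i1 mask vector plus
    // VF scalar branches.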
6532     bool ScalarPredicatedBB = false;
6533     BranchInst *BI = cast<BranchInst>(I);
6534     if (VF.isVector() && BI->isConditional() &&
6535         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6536          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6537       ScalarPredicatedBB = true;
6538 
6539     if (ScalarPredicatedBB) {
6540       // Return cost for branches around scalarized and predicated blocks.
6541       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6542       auto *Vec_i1Ty =
6543           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6544       return (TTI.getScalarizationOverhead(
6545                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
6546                   false, true) +
6547               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
6548                VF.getKnownMinValue()));
6549     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6550       // The back-edge branch will remain, as will all scalar branches.
6551       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6552     else
6553       // This branch will be eliminated by if-conversion.
6554       return 0;
6555     // Note: We currently assume zero cost for an unconditional branch inside
6556     // a predicated block since it will become a fall-through, although we
6557     // may decide in the future to call TTI for all branches.
6558   }
6559   case Instruction::PHI: {
6560     auto *Phi = cast<PHINode>(I);
6561 
6562     // First-order recurrences are replaced by vector shuffles inside the loop.
6563     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6564     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
6565       return TTI.getShuffleCost(
6566           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
6567           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
6568 
6569     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6570     // converted into select instructions. We require N - 1 selects per phi
6571     // node, where N is the number of incoming values.
6572     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6573       return (Phi->getNumIncomingValues() - 1) *
6574              TTI.getCmpSelInstrCost(
6575                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6576                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6577                  CostKind);
6578 
6579     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6580   }
6581   case Instruction::UDiv:
6582   case Instruction::SDiv:
6583   case Instruction::URem:
6584   case Instruction::SRem:
6585     // If we have a predicated instruction, it may not be executed for each
6586     // vector lane. Get the scalarization cost and scale this amount by the
6587     // probability of executing the predicated block. If the instruction is not
6588     // predicated, we fall through to the next case.
6589     if (VF.isVector() && isScalarWithPredication(I)) {
6590       unsigned Cost = 0;
6591 
6592       // These instructions have a non-void type, so account for the phi nodes
6593       // that we will create. This cost is likely to be zero. The phi node
6594       // cost, if any, should be scaled by the block probability because it
6595       // models a copy at the end of each predicated block.
6596       Cost += VF.getKnownMinValue() *
6597               TTI.getCFInstrCost(Instruction::PHI, CostKind);
6598 
6599       // The cost of the non-predicated instruction.
6600       Cost += VF.getKnownMinValue() *
6601               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
6602 
6603       // The cost of insertelement and extractelement instructions needed for
6604       // scalarization.
6605       Cost += getScalarizationOverhead(I, VF);
6606 
6607       // Scale the cost by the probability of executing the predicated blocks.
6608       // This assumes the predicated block for each vector lane is equally
6609       // likely.
6610       return Cost / getReciprocalPredBlockProb();
6611     }
6612     LLVM_FALLTHROUGH;
6613   case Instruction::Add:
6614   case Instruction::FAdd:
6615   case Instruction::Sub:
6616   case Instruction::FSub:
6617   case Instruction::Mul:
6618   case Instruction::FMul:
6619   case Instruction::FDiv:
6620   case Instruction::FRem:
6621   case Instruction::Shl:
6622   case Instruction::LShr:
6623   case Instruction::AShr:
6624   case Instruction::And:
6625   case Instruction::Or:
6626   case Instruction::Xor: {
6627     // Since we replace the stride by 1, the multiplication should go away.
6628     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6629       return 0;
6630     // Certain instructions can be cheaper to vectorize if they have a constant
6631     // second vector operand. One example of this is shifts on x86.
6632     Value *Op2 = I->getOperand(1);
6633     TargetTransformInfo::OperandValueProperties Op2VP;
6634     TargetTransformInfo::OperandValueKind Op2VK =
6635         TTI.getOperandInfo(Op2, Op2VP);
6636     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6637       Op2VK = TargetTransformInfo::OK_UniformValue;
6638 
6639     SmallVector<const Value *, 4> Operands(I->operand_values());
6640     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6641     return N * TTI.getArithmeticInstrCost(
6642                    I->getOpcode(), VectorTy, CostKind,
6643                    TargetTransformInfo::OK_AnyValue,
6644                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6645   }
6646   case Instruction::FNeg: {
6647     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
6648     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6649     return N * TTI.getArithmeticInstrCost(
6650                    I->getOpcode(), VectorTy, CostKind,
6651                    TargetTransformInfo::OK_AnyValue,
6652                    TargetTransformInfo::OK_AnyValue,
6653                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6654                    I->getOperand(0), I);
6655   }
6656   case Instruction::Select: {
6657     SelectInst *SI = cast<SelectInst>(I);
6658     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6659     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6660     Type *CondTy = SI->getCondition()->getType();
6661     if (!ScalarCond) {
6662       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
6663       CondTy = VectorType::get(CondTy, VF);
6664     }
6665     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
6666                                   CostKind, I);
6667   }
6668   case Instruction::ICmp:
6669   case Instruction::FCmp: {
6670     Type *ValTy = I->getOperand(0)->getType();
6671     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6672     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6673       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6674     VectorTy = ToVectorTy(ValTy, VF);
6675     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
6676                                   I);
6677   }
6678   case Instruction::Store:
6679   case Instruction::Load: {
6680     ElementCount Width = VF;
6681     if (Width.isVector()) {
6682       InstWidening Decision = getWideningDecision(I, Width);
6683       assert(Decision != CM_Unknown &&
6684              "CM decision should be taken at this point");
6685       if (Decision == CM_Scalarize)
6686         Width = ElementCount::getFixed(1);
6687     }
6688     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6689     return getMemoryInstructionCost(I, VF);
6690   }
6691   case Instruction::ZExt:
6692   case Instruction::SExt:
6693   case Instruction::FPToUI:
6694   case Instruction::FPToSI:
6695   case Instruction::FPExt:
6696   case Instruction::PtrToInt:
6697   case Instruction::IntToPtr:
6698   case Instruction::SIToFP:
6699   case Instruction::UIToFP:
6700   case Instruction::Trunc:
6701   case Instruction::FPTrunc:
6702   case Instruction::BitCast: {
6703     // Computes the CastContextHint from a Load/Store instruction.
6704     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6705       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6706              "Expected a load or a store!");
6707 
6708       if (VF.isScalar() || !TheLoop->contains(I))
6709         return TTI::CastContextHint::Normal;
6710 
6711       switch (getWideningDecision(I, VF)) {
6712       case LoopVectorizationCostModel::CM_GatherScatter:
6713         return TTI::CastContextHint::GatherScatter;
6714       case LoopVectorizationCostModel::CM_Interleave:
6715         return TTI::CastContextHint::Interleave;
6716       case LoopVectorizationCostModel::CM_Scalarize:
6717       case LoopVectorizationCostModel::CM_Widen:
6718         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6719                                         : TTI::CastContextHint::Normal;
6720       case LoopVectorizationCostModel::CM_Widen_Reverse:
6721         return TTI::CastContextHint::Reversed;
6722       case LoopVectorizationCostModel::CM_Unknown:
6723         llvm_unreachable("Instr did not go through cost modelling?");
6724       }
6725 
6726       llvm_unreachable("Unhandled case!");
6727     };
6728 
6729     unsigned Opcode = I->getOpcode();
6730     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6731     // For Trunc, the context is the only user, which must be a StoreInst.
6732     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6733       if (I->hasOneUse())
6734         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6735           CCH = ComputeCCH(Store);
6736     }
6737     // For Z/Sext, the context is the operand, which must be a LoadInst.
6738     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6739              Opcode == Instruction::FPExt) {
6740       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6741         CCH = ComputeCCH(Load);
6742     }
6743 
6744     // We optimize the truncation of induction variables having constant
6745     // integer steps. The cost of these truncations is the same as the scalar
6746     // operation.
6747     if (isOptimizableIVTruncate(I, VF)) {
6748       auto *Trunc = cast<TruncInst>(I);
6749       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6750                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6751     }
6752 
6753     Type *SrcScalarTy = I->getOperand(0)->getType();
6754     Type *SrcVecTy =
6755         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6756     if (canTruncateToMinimalBitwidth(I, VF)) {
6757       // This cast is going to be shrunk. This may remove the cast or turn it
6758       // into a slightly different cast. For example, if MinBW == 16,
6759       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6760       //
6761       // Calculate the modified src and dest types.
6762       Type *MinVecTy = VectorTy;
6763       if (Opcode == Instruction::Trunc) {
6764         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6765         VectorTy =
6766             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6767       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
6768         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6769         VectorTy =
6770             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6771       }
6772     }
6773 
6774     assert(!VF.isScalable() && "VF is assumed to be non scalable");
6775     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6776     return N *
6777            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6778   }
6779   case Instruction::Call: {
6780     bool NeedToScalarize;
6781     CallInst *CI = cast<CallInst>(I);
6782     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6783     if (getVectorIntrinsicIDForCall(CI, TLI))
6784       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6785     return CallCost;
6786   }
6787   default:
6788     // The cost of executing VF copies of the scalar instruction. This opcode
6789     // is unknown. Assume that it is the same as 'mul'.
6790     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
6791                                        Instruction::Mul, VectorTy, CostKind) +
6792            getScalarizationOverhead(I, VF);
6793   } // end of switch.
6794 }
6795 
6796 char LoopVectorize::ID = 0;
6797 
6798 static const char lv_name[] = "Loop Vectorization";
6799 
6800 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6801 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6802 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6803 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6804 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6805 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6806 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6807 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6808 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6809 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6810 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6811 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6812 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6813 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6814 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6815 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6816 
6817 namespace llvm {
6818 
6819 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6820 
6821 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6822                               bool VectorizeOnlyWhenForced) {
6823   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6824 }
6825 
6826 } // end namespace llvm
6827 
6828 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6829   // Check if the pointer operand of a load or store instruction is
6830   // consecutive.
6831   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6832     return Legal->isConsecutivePtr(Ptr);
6833   return false;
6834 }
6835 
6836 void LoopVectorizationCostModel::collectValuesToIgnore() {
6837   // Ignore ephemeral values.
6838   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6839 
6840   // Ignore type-promoting instructions we identified during reduction
6841   // detection.
6842   for (auto &Reduction : Legal->getReductionVars()) {
6843     RecurrenceDescriptor &RedDes = Reduction.second;
6844     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6845     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6846   }
6847   // Ignore type-casting instructions we identified during induction
6848   // detection.
6849   for (auto &Induction : Legal->getInductionVars()) {
6850     InductionDescriptor &IndDes = Induction.second;
6851     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6852     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6853   }
6854 }
6855 
6856 void LoopVectorizationCostModel::collectInLoopReductions() {
6857   // For the moment, without predicated reduction instructions, we do not
6858   // support inloop reductions whilst folding the tail, and hence in those cases
6859   // all reductions are currently out of the loop.
6860   if (!PreferInLoopReductions || foldTailByMasking())
6861     return;
6862 
6863   for (auto &Reduction : Legal->getReductionVars()) {
6864     PHINode *Phi = Reduction.first;
6865     RecurrenceDescriptor &RdxDesc = Reduction.second;
6866 
6867     // We don't collect reductions that are type promoted (yet).
6868     if (RdxDesc.getRecurrenceType() != Phi->getType())
6869       continue;
6870 
6871     // Check that we can correctly put the reductions into the loop, by
6872     // finding the chain of operations that leads from the phi to the loop
6873     // exit value.
6874     SmallVector<Instruction *, 4> ReductionOperations =
6875         RdxDesc.getReductionOpChain(Phi, TheLoop);
6876     bool InLoop = !ReductionOperations.empty();
6877     if (InLoop)
6878       InLoopReductionChains[Phi] = ReductionOperations;
6879     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6880                       << " reduction for phi: " << *Phi << "\n");
6881   }
6882 }
6883 
6884 // TODO: we could return a pair of values that specify the max VF and
6885 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6886 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
6887 // doesn't have a cost model that can choose which plan to execute if
6888 // more than one is generated.
6889 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6890                                  LoopVectorizationCostModel &CM) {
6891   unsigned WidestType;
6892   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6893   return WidestVectorRegBits / WidestType;
6894 }
6895 
6896 VectorizationFactor
6897 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
6898   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
6899   ElementCount VF = UserVF;
6900   // Outer loop handling: outer loops may require CFG and instruction level
6901   // transformations before even evaluating whether vectorization is profitable.
6902   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6903   // the vectorization pipeline.
6904   if (!OrigLoop->empty()) {
6905     // If the user doesn't provide a vectorization factor, determine a
6906     // reasonable one.
6907     if (UserVF.isZero()) {
6908       VF = ElementCount::getFixed(
6909           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
6910       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6911 
6912       // Make sure we have a VF > 1 for stress testing.
6913       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6914         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6915                           << "overriding computed VF.\n");
6916         VF = ElementCount::getFixed(4);
6917       }
6918     }
6919     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6920     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
6921            "VF needs to be a power of two");
6922     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6923                       << "VF " << VF << " to build VPlans.\n");
6924     buildVPlans(VF.getKnownMinValue(), VF.getKnownMinValue());
6925 
6926     // For VPlan build stress testing, we bail out after VPlan construction.
6927     if (VPlanBuildStressTest)
6928       return VectorizationFactor::Disabled();
6929 
6930     return {VF, 0 /*Cost*/};
6931   }
6932 
6933   LLVM_DEBUG(
6934       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6935                 "VPlan-native path.\n");
6936   return VectorizationFactor::Disabled();
6937 }
6938 
6939 Optional<VectorizationFactor>
6940 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6941   assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
6942   assert(OrigLoop->empty() && "Inner loop expected.");
6943   Optional<unsigned> MaybeMaxVF =
6944       CM.computeMaxVF(UserVF.getKnownMinValue(), UserIC);
6945   if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6946     return None;
6947 
6948   // Invalidate interleave groups if all blocks of loop will be predicated.
6949   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6950       !useMaskedInterleavedAccesses(*TTI)) {
6951     LLVM_DEBUG(
6952         dbgs()
6953         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6954            "which requires masked-interleaved support.\n");
6955     if (CM.InterleaveInfo.invalidateGroups())
6956       // Invalidating interleave groups also requires invalidating all decisions
6957       // based on them, which includes widening decisions and uniform and scalar
6958       // values.
6959       CM.invalidateCostModelingDecisions();
6960   }
6961 
6962   if (!UserVF.isZero()) {
6963     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6964     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
6965            "VF needs to be a power of two");
6966     // Collect the instructions (and their associated costs) that will be more
6967     // profitable to scalarize.
6968     CM.selectUserVectorizationFactor(UserVF);
6969     CM.collectInLoopReductions();
6970     buildVPlansWithVPRecipes(UserVF.getKnownMinValue(),
6971                              UserVF.getKnownMinValue());
6972     LLVM_DEBUG(printPlans(dbgs()));
6973     return {{UserVF, 0}};
6974   }
6975 
6976   unsigned MaxVF = MaybeMaxVF.getValue();
6977   assert(MaxVF != 0 && "MaxVF is zero.");
6978 
6979   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6980     // Collect Uniform and Scalar instructions after vectorization with VF.
6981     CM.collectUniformsAndScalars(ElementCount::getFixed(VF));
6982 
6983     // Collect the instructions (and their associated costs) that will be more
6984     // profitable to scalarize.
6985     if (VF > 1)
6986       CM.collectInstsToScalarize(ElementCount::getFixed(VF));
6987   }
6988 
6989   CM.collectInLoopReductions();
6990 
6991   buildVPlansWithVPRecipes(1, MaxVF);
6992   LLVM_DEBUG(printPlans(dbgs()));
6993   if (MaxVF == 1)
6994     return VectorizationFactor::Disabled();
6995 
6996   // Select the optimal vectorization factor.
6997   return CM.selectVectorizationFactor(MaxVF);
6998 }
6999 
7000 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7001   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7002                     << '\n');
7003   BestVF = VF;
7004   BestUF = UF;
7005 
7006   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7007     return !Plan->hasVF(VF);
7008   });
7009   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7010 }
7011 
7012 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7013                                            DominatorTree *DT) {
7014   // Perform the actual loop transformation.
7015 
7016   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7017   VPCallbackILV CallbackILV(ILV);
7018 
7019   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7020 
7021   VPTransformState State{*BestVF, BestUF,      LI,
7022                          DT,      ILV.Builder, ILV.VectorLoopValueMap,
7023                          &ILV,    CallbackILV};
7024   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7025   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7026   State.CanonicalIV = ILV.Induction;
7027 
7028   //===------------------------------------------------===//
7029   //
7030   // Notice: any optimization or new instruction that goes
7031   // into the code below should also be implemented in
7032   // the cost-model.
7033   //
7034   //===------------------------------------------------===//
7035 
7036   // 2. Copy and widen instructions from the old loop into the new loop.
7037   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7038   VPlans.front()->execute(&State);
7039 
7040   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7041   //    predication, updating analyses.
7042   ILV.fixVectorizedLoop();
7043 }
7044 
7045 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7046     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7047   BasicBlock *Latch = OrigLoop->getLoopLatch();
7048 
7049   // We create new control-flow for the vectorized loop, so the original
7050   // condition will be dead after vectorization if it's only used by the
7051   // branch.
7052   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
7053   if (Cmp && Cmp->hasOneUse())
7054     DeadInstructions.insert(Cmp);
7055 
7056   // We create new "steps" for induction variable updates to which the original
7057   // induction variables map. An original update instruction will be dead if
7058   // all its users except the induction variable are dead.
7059   for (auto &Induction : Legal->getInductionVars()) {
7060     PHINode *Ind = Induction.first;
7061     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7062     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7063           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7064         }))
7065       DeadInstructions.insert(IndUpdate);
7066 
7067     // We record as "Dead" also the type-casting instructions we had identified
7068     // during induction analysis. We don't need any handling for them in the
7069     // vectorized loop because we have proven that, under a proper runtime
7070     // test guarding the vectorized loop, the value of the phi, and the casted
7071     // value of the phi, are the same. The last instruction in this casting chain
7072     // will get its scalar/vector/widened def from the scalar/vector/widened def
7073     // of the respective phi node. Any other casts in the induction def-use chain
7074     // have no other uses outside the phi update chain, and will be ignored.
7075     InductionDescriptor &IndDes = Induction.second;
7076     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7077     DeadInstructions.insert(Casts.begin(), Casts.end());
7078   }
7079 }
7080 
7081 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7082 
7083 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7084 
7085 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7086                                         Instruction::BinaryOps BinOp) {
7087   // When unrolling and the VF is 1, we only need to add a simple scalar.
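  // The result is Val combined (via BinOp or an integer add) with
  // StartIdx * Step, using plain scalar arithmetic instead of a step vector.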
7088   Type *Ty = Val->getType();
7089   assert(!Ty->isVectorTy() && "Val must be a scalar");
7090 
7091   if (Ty->isFloatingPointTy()) {
7092     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7093 
7094     // Floating point operations had to be 'fast' to enable the unrolling.
7095     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7096     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7097   }
7098   Constant *C = ConstantInt::get(Ty, StartIdx);
7099   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7100 }
7101 
7102 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7103   SmallVector<Metadata *, 4> MDs;
7104   // Reserve first location for self reference to the LoopID metadata node.
7105   MDs.push_back(nullptr);
7106   bool IsUnrollMetadata = false;
7107   MDNode *LoopID = L->getLoopID();
7108   if (LoopID) {
7109     // First find existing loop unrolling disable metadata.
7110     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7111       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7112       if (MD) {
7113         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7114         IsUnrollMetadata =
7115             S && S->getString().startswith("llvm.loop.unroll.disable");
7116       }
7117       MDs.push_back(LoopID->getOperand(i));
7118     }
7119   }
7120 
7121   if (!IsUnrollMetadata) {
7122     // Add runtime unroll disable metadata.
7123     LLVMContext &Context = L->getHeader()->getContext();
7124     SmallVector<Metadata *, 1> DisableOperands;
7125     DisableOperands.push_back(
7126         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7127     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7128     MDs.push_back(DisableNode);
7129     MDNode *NewLoopID = MDNode::get(Context, MDs);
7130     // Set operand 0 to refer to the loop id itself.
7131     NewLoopID->replaceOperandWith(0, NewLoopID);
7132     L->setLoopID(NewLoopID);
7133   }
7134 }
7135 
7136 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7137     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7138   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
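  // Evaluate the predicate once at Range.Start, then clamp Range.End at the
  // first VF for which the predicate's answer differs from that at the start.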
7139   bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start));
7140 
7141   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
7142     if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) {
7143       Range.End = TmpVF;
7144       break;
7145     }
7146 
7147   return PredicateAtRangeStart;
7148 }
7149 
7150 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7151 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7152 /// of VF's starting at a given VF and extending it as much as possible. Each
7153 /// vectorization decision can potentially shorten this sub-range during
7154 /// buildVPlan().
7155 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
7156   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7157     VFRange SubRange = {VF, MaxVF + 1};
7158     VPlans.push_back(buildVPlan(SubRange));
7159     VF = SubRange.End;
7160   }
7161 }
7162 
7163 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7164                                          VPlanPtr &Plan) {
7165   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7166 
7167   // Look for cached value.
7168   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7169   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7170   if (ECEntryIt != EdgeMaskCache.end())
7171     return ECEntryIt->second;
7172 
7173   VPValue *SrcMask = createBlockInMask(Src, Plan);
7174 
7175   // The terminator has to be a branch inst!
7176   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7177   assert(BI && "Unexpected terminator found");
7178 
7179   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7180     return EdgeMaskCache[Edge] = SrcMask;
7181 
7182   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
7183   assert(EdgeMask && "No Edge Mask found for condition");
7184 
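  // The branch condition is true when control flows to successor 0; if Dst is
  // the other successor, the edge is taken when the condition is false, so the
  // mask must be negated.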
7185   if (BI->getSuccessor(0) != Dst)
7186     EdgeMask = Builder.createNot(EdgeMask);
7187 
7188   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7189     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7190 
7191   return EdgeMaskCache[Edge] = EdgeMask;
7192 }
7193 
7194 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7195   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7196 
7197   // Look for cached value.
7198   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7199   if (BCEntryIt != BlockMaskCache.end())
7200     return BCEntryIt->second;
7201 
7202   // All-one mask is modelled as no-mask following the convention for masked
7203   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7204   VPValue *BlockMask = nullptr;
7205 
7206   if (OrigLoop->getHeader() == BB) {
7207     if (!CM.blockNeedsPredication(BB))
7208       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7209 
7210     // Introduce the early-exit compare IV <= BTC to form header block mask.
7211     // This is used instead of IV < TC because TC may wrap, unlike BTC.
7212     // Start by constructing the desired canonical IV.
7213     VPValue *IV = nullptr;
7214     if (Legal->getPrimaryInduction())
7215       IV = Plan->getVPValue(Legal->getPrimaryInduction());
7216     else {
7217       auto IVRecipe = new VPWidenCanonicalIVRecipe();
7218       Builder.getInsertBlock()->appendRecipe(IVRecipe);
7219       IV = IVRecipe->getVPValue();
7220     }
7221     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7222     bool TailFolded = !CM.isScalarEpilogueAllowed();
7223 
7224     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
7225       // While ActiveLaneMask is a binary op that consumes the loop tripcount
7226       // as a second argument, we only pass the IV here and extract the
7227       // tripcount from the transform state, where codegen of the VP
7228       // instructions happens.
7229       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
7230     } else {
7231       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
7232     }
7233     return BlockMaskCache[BB] = BlockMask;
7234   }
7235 
  // This is the block mask. We OR the masks of all incoming edges.
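  // For instance, for a block with two predecessors P0 and P1 the mask is
  // createEdgeMask(P0, BB) | createEdgeMask(P1, BB).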
7237   for (auto *Predecessor : predecessors(BB)) {
7238     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7239     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7240       return BlockMaskCache[BB] = EdgeMask;
7241 
7242     if (!BlockMask) { // BlockMask has its initialized nullptr value.
7243       BlockMask = EdgeMask;
7244       continue;
7245     }
7246 
7247     BlockMask = Builder.createOr(BlockMask, EdgeMask);
7248   }
7249 
7250   return BlockMaskCache[BB] = BlockMask;
7251 }
7252 
7253 VPWidenMemoryInstructionRecipe *
7254 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7255                                   VPlanPtr &Plan) {
7256   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7257          "Must be called with either a load or store");
7258 
7259   auto willWiden = [&](ElementCount VF) -> bool {
7260     assert(!VF.isScalable() && "unexpected scalable ElementCount");
7261     if (VF.isScalar())
7262       return false;
7263     LoopVectorizationCostModel::InstWidening Decision =
7264         CM.getWideningDecision(I, VF);
7265     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7266            "CM decision should be taken at this point.");
7267     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7268       return true;
7269     if (CM.isScalarAfterVectorization(I, VF) ||
7270         CM.isProfitableToScalarize(I, VF))
7271       return false;
7272     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7273   };
7274 
7275   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7276     return nullptr;
7277 
7278   VPValue *Mask = nullptr;
7279   if (Legal->isMaskRequired(I))
7280     Mask = createBlockInMask(I->getParent(), Plan);
7281 
7282   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
7283   if (LoadInst *Load = dyn_cast<LoadInst>(I))
7284     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
7285 
7286   StoreInst *Store = cast<StoreInst>(I);
7287   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
7288   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
7289 }
7290 
7291 VPWidenIntOrFpInductionRecipe *
7292 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
7293   // Check if this is an integer or fp induction. If so, build the recipe that
7294   // produces its scalar and vector values.
7295   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
7296   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
7297       II.getKind() == InductionDescriptor::IK_FpInduction)
7298     return new VPWidenIntOrFpInductionRecipe(Phi);
7299 
7300   return nullptr;
7301 }
7302 
7303 VPWidenIntOrFpInductionRecipe *
7304 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
7305                                                 VFRange &Range) const {
7306   // Optimize the special case where the source is a constant integer
7307   // induction variable. Notice that we can only optimize the 'trunc' case
7308   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7309   // (c) other casts depend on pointer size.
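  // For example, a 'trunc i64 %iv to i32' of an induction variable can be
  // widened directly as an i32 induction, avoiding the wide i64 vector IV.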
7310 
7311   // Determine whether \p K is a truncation based on an induction variable that
7312   // can be optimized.
7313   auto isOptimizableIVTruncate =
7314       [&](Instruction *K) -> std::function<bool(ElementCount)> {
7315     return [=](ElementCount VF) -> bool {
7316       return CM.isOptimizableIVTruncate(K, VF);
7317     };
7318   };
7319 
7320   if (LoopVectorizationPlanner::getDecisionAndClampRange(
7321           isOptimizableIVTruncate(I), Range))
7322     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
7323                                              I);
7324   return nullptr;
7325 }
7326 
7327 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
7328   // We know that all PHIs in non-header blocks are converted into selects, so
7329   // we don't have to worry about the insertion order and we can just use the
7330   // builder. At this point we generate the predication tree. There may be
7331   // duplications since this is a simple recursive scan, but future
7332   // optimizations will clean it up.
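  // The recipe's operands are laid out as (In0, Mask0, In1, Mask1, ...); a
  // mask is omitted only for a single-predecessor phi whose edge mask is
  // all-one (nullptr).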
7333 
7334   SmallVector<VPValue *, 2> Operands;
7335   unsigned NumIncoming = Phi->getNumIncomingValues();
7336   for (unsigned In = 0; In < NumIncoming; In++) {
7337     VPValue *EdgeMask =
7338       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
7339     assert((EdgeMask || NumIncoming == 1) &&
7340            "Multiple predecessors with one having a full mask");
7341     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
7342     if (EdgeMask)
7343       Operands.push_back(EdgeMask);
7344   }
7345   return new VPBlendRecipe(Phi, Operands);
7346 }
7347 
7348 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
7349                                                    VPlan &Plan) const {
7350 
7351   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7352       [this, CI](ElementCount VF) {
7353         return CM.isScalarWithPredication(CI, VF);
7354       },
7355       Range);
7356 
7357   if (IsPredicated)
7358     return nullptr;
7359 
7360   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7361   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7362              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
7363     return nullptr;
7364 
7365   auto willWiden = [&](ElementCount VF) -> bool {
7366     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag indicates whether we use an intrinsic or a plain call for the
    // vectorized version of the instruction, i.e. whether the intrinsic call
    // is more beneficial than a library call.
7371     bool NeedToScalarize = false;
7372     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
7373     bool UseVectorIntrinsic =
7374         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
7375     return UseVectorIntrinsic || !NeedToScalarize;
7376   };
7377 
7378   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7379     return nullptr;
7380 
7381   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
7382 }
7383 
7384 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7385   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7386          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // The instruction should be widened, unless it is scalar after
  // vectorization, scalarization is profitable, or it is predicated.
7389   auto WillScalarize = [this, I](ElementCount VF) -> bool {
7390     return CM.isScalarAfterVectorization(I, VF) ||
7391            CM.isProfitableToScalarize(I, VF) ||
7392            CM.isScalarWithPredication(I, VF);
7393   };
7394   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
7395                                                              Range);
7396 }
7397 
7398 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
7399   auto IsVectorizableOpcode = [](unsigned Opcode) {
7400     switch (Opcode) {
7401     case Instruction::Add:
7402     case Instruction::And:
7403     case Instruction::AShr:
7404     case Instruction::BitCast:
7405     case Instruction::FAdd:
7406     case Instruction::FCmp:
7407     case Instruction::FDiv:
7408     case Instruction::FMul:
7409     case Instruction::FNeg:
7410     case Instruction::FPExt:
7411     case Instruction::FPToSI:
7412     case Instruction::FPToUI:
7413     case Instruction::FPTrunc:
7414     case Instruction::FRem:
7415     case Instruction::FSub:
7416     case Instruction::ICmp:
7417     case Instruction::IntToPtr:
7418     case Instruction::LShr:
7419     case Instruction::Mul:
7420     case Instruction::Or:
7421     case Instruction::PtrToInt:
7422     case Instruction::SDiv:
7423     case Instruction::Select:
7424     case Instruction::SExt:
7425     case Instruction::Shl:
7426     case Instruction::SIToFP:
7427     case Instruction::SRem:
7428     case Instruction::Sub:
7429     case Instruction::Trunc:
7430     case Instruction::UDiv:
7431     case Instruction::UIToFP:
7432     case Instruction::URem:
7433     case Instruction::Xor:
7434     case Instruction::ZExt:
7435       return true;
7436     }
7437     return false;
7438   };
7439 
7440   if (!IsVectorizableOpcode(I->getOpcode()))
7441     return nullptr;
7442 
7443   // Success: widen this instruction.
7444   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
7445 }
7446 
7447 VPBasicBlock *VPRecipeBuilder::handleReplication(
7448     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7449     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7450     VPlanPtr &Plan) {
7451   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7452       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
7453       Range);
7454 
7455   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7456       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
7457       Range);
7458 
7459   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
7460                                        IsUniform, IsPredicated);
7461   setRecipe(I, Recipe);
7462 
  // Check whether I uses a predicated instruction. If so, it will use its
  // scalar value. Avoid hoisting the insert-element which packs the scalar
  // value into a vector value, as that happens iff all users use the vector
  // value.
7466   for (auto &Op : I->operands())
7467     if (auto *PredInst = dyn_cast<Instruction>(Op))
7468       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7469         PredInst2Recipe[PredInst]->setAlsoPack(false);
7470 
7471   // Finalize the recipe for Instr, first if it is not predicated.
7472   if (!IsPredicated) {
7473     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7474     VPBB->appendRecipe(Recipe);
7475     return VPBB;
7476   }
7477   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7478   assert(VPBB->getSuccessors().empty() &&
7479          "VPBB has successors when handling predicated replication.");
7480   // Record predicated instructions for above packing optimizations.
7481   PredInst2Recipe[I] = Recipe;
7482   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7483   VPBlockUtils::insertBlockAfter(Region, VPBB);
7484   auto *RegSucc = new VPBasicBlock();
7485   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7486   return RegSucc;
7487 }
7488 
7489 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7490                                                       VPRecipeBase *PredRecipe,
7491                                                       VPlanPtr &Plan) {
7492   // Instructions marked for predication are replicated and placed under an
7493   // if-then construct to prevent side-effects.
7494 
7495   // Generate recipes to compute the block mask for this region.
7496   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7497 
7498   // Build the triangular if-then region.
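  // The region forms a triangle: the entry block branches on the mask either
  // into the "pred.<opcode>.if" block holding the replicated instruction, or
  // directly to the "pred.<opcode>.continue" block, which merges the result
  // via a phi recipe when the instruction produces a value.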
7499   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7500   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7501   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7502   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7503   auto *PHIRecipe =
7504       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7505   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7506   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7507   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7508 
7509   // Note: first set Entry as region entry and then connect successors starting
7510   // from it in order, to propagate the "parent" of each VPBasicBlock.
7511   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7512   VPBlockUtils::connectBlocks(Pred, Exit);
7513 
7514   return Region;
7515 }
7516 
7517 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
7518                                                       VFRange &Range,
7519                                                       VPlanPtr &Plan) {
7520   // First, check for specific widening recipes that deal with calls, memory
7521   // operations, inductions and Phi nodes.
7522   if (auto *CI = dyn_cast<CallInst>(Instr))
7523     return tryToWidenCall(CI, Range, *Plan);
7524 
7525   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
7526     return tryToWidenMemory(Instr, Range, Plan);
7527 
7528   VPRecipeBase *Recipe;
7529   if (auto Phi = dyn_cast<PHINode>(Instr)) {
7530     if (Phi->getParent() != OrigLoop->getHeader())
7531       return tryToBlend(Phi, Plan);
7532     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
7533       return Recipe;
7534     return new VPWidenPHIRecipe(Phi);
7535   }
7536 
7537   if (isa<TruncInst>(Instr) &&
7538       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
7539     return Recipe;
7540 
7541   if (!shouldWiden(Instr, Range))
7542     return nullptr;
7543 
7544   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
7545     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
7546                                 OrigLoop);
7547 
7548   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
7549     bool InvariantCond =
7550         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
7551     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
7552                                    InvariantCond);
7553   }
7554 
7555   return tryToWiden(Instr, *Plan);
7556 }
7557 
7558 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7559                                                         unsigned MaxVF) {
7560   assert(OrigLoop->empty() && "Inner loop expected.");
7561 
7562   // Collect conditions feeding internal conditional branches; they need to be
7563   // represented in VPlan for it to model masking.
7564   SmallPtrSet<Value *, 1> NeedDef;
7565 
7566   auto *Latch = OrigLoop->getLoopLatch();
7567   for (BasicBlock *BB : OrigLoop->blocks()) {
7568     if (BB == Latch)
7569       continue;
7570     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7571     if (Branch && Branch->isConditional())
7572       NeedDef.insert(Branch->getCondition());
7573   }
7574 
  // If the tail is to be folded by masking, the primary induction variable, if
  // it exists, needs to be represented in VPlan so that VPlan can model
  // early-exit masking.
7577   // Also, both the Phi and the live-out instruction of each reduction are
7578   // required in order to introduce a select between them in VPlan.
7579   if (CM.foldTailByMasking()) {
7580     if (Legal->getPrimaryInduction())
7581       NeedDef.insert(Legal->getPrimaryInduction());
7582     for (auto &Reduction : Legal->getReductionVars()) {
7583       NeedDef.insert(Reduction.first);
7584       NeedDef.insert(Reduction.second.getLoopExitInstr());
7585     }
7586   }
7587 
7588   // Collect instructions from the original loop that will become trivially dead
7589   // in the vectorized loop. We don't need to vectorize these instructions. For
7590   // example, original induction update instructions can become dead because we
7591   // separately emit induction "steps" when generating code for the new loop.
7592   // Similarly, we create a new latch condition when setting up the structure
7593   // of the new loop, so the old one can become dead.
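  // For instance, an induction update such as '%iv.next = add %iv, 1' that is
  // used only by the latch compare and the header phi typically falls into
  // this category.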
7594   SmallPtrSet<Instruction *, 4> DeadInstructions;
7595   collectTriviallyDeadInstructions(DeadInstructions);
7596 
7597   // Add assume instructions we need to drop to DeadInstructions, to prevent
7598   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
7600   // control flow is preserved, we should keep them.
7601   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7602   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7603 
7604   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7605   // Dead instructions do not need sinking. Remove them from SinkAfter.
7606   for (Instruction *I : DeadInstructions)
7607     SinkAfter.erase(I);
7608 
7609   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7610     VFRange SubRange = {VF, MaxVF + 1};
7611     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7612                                              DeadInstructions, SinkAfter));
7613     VF = SubRange.End;
7614   }
7615 }
7616 
7617 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7618     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7619     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7620     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7621 
7622   // Hold a mapping from predicated instructions to their recipes, in order to
7623   // fix their AlsoPack behavior if a user is determined to replicate and use a
7624   // scalar instead of vector value.
7625   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7626 
7627   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7628 
7629   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7630 
7631   // ---------------------------------------------------------------------------
7632   // Pre-construction: record ingredients whose recipes we'll need to further
7633   // process after constructing the initial VPlan.
7634   // ---------------------------------------------------------------------------
7635 
7636   // Mark instructions we'll need to sink later and their targets as
7637   // ingredients whose recipe we'll need to record.
7638   for (auto &Entry : SinkAfter) {
7639     RecipeBuilder.recordRecipeOf(Entry.first);
7640     RecipeBuilder.recordRecipeOf(Entry.second);
7641   }
7642   for (auto &Reduction : CM.getInLoopReductionChains()) {
7643     PHINode *Phi = Reduction.first;
7644     RecurrenceDescriptor::RecurrenceKind Kind =
7645         Legal->getReductionVars()[Phi].getRecurrenceKind();
7646     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7647 
7648     RecipeBuilder.recordRecipeOf(Phi);
7649     for (auto &R : ReductionOperations) {
7650       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
7652       // need to record the ICmp recipe, so it can be removed later.
7653       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7654           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7655         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
7656       }
7657     }
7658   }
7659 
7660   // For each interleave group which is relevant for this (possibly trimmed)
7661   // Range, add it to the set of groups to be later applied to the VPlan and add
7662   // placeholders for its members' Recipes which we'll be replacing with a
7663   // single VPInterleaveRecipe.
7664   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7665     auto applyIG = [IG, this](ElementCount VF) -> bool {
7666       return (VF.isVector() && // Query is illegal for VF == 1
7667               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7668                   LoopVectorizationCostModel::CM_Interleave);
7669     };
7670     if (!getDecisionAndClampRange(applyIG, Range))
7671       continue;
7672     InterleaveGroups.insert(IG);
7673     for (unsigned i = 0; i < IG->getFactor(); i++)
7674       if (Instruction *Member = IG->getMember(i))
7675         RecipeBuilder.recordRecipeOf(Member);
7676   };
7677 
7678   // ---------------------------------------------------------------------------
7679   // Build initial VPlan: Scan the body of the loop in a topological order to
7680   // visit each basic block after having visited its predecessor basic blocks.
7681   // ---------------------------------------------------------------------------
7682 
7683   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7684   auto Plan = std::make_unique<VPlan>();
7685   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7686   Plan->setEntry(VPBB);
7687 
7688   // Represent values that will have defs inside VPlan.
7689   for (Value *V : NeedDef)
7690     Plan->addVPValue(V);
7691 
7692   // Scan the body of the loop in a topological order to visit each basic block
7693   // after having visited its predecessor basic blocks.
7694   LoopBlocksDFS DFS(OrigLoop);
7695   DFS.perform(LI);
7696 
7697   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7698     // Relevant instructions from basic block BB will be grouped into VPRecipe
7699     // ingredients and fill a new VPBasicBlock.
7700     unsigned VPBBsForBB = 0;
7701     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7702     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7703     VPBB = FirstVPBBForBB;
7704     Builder.setInsertPoint(VPBB);
7705 
7706     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
7708     for (Instruction &I : BB->instructionsWithoutDebug()) {
7709       Instruction *Instr = &I;
7710 
7711       // First filter out irrelevant instructions, to ensure no recipes are
7712       // built for them.
7713       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
7714         continue;
7715 
7716       if (auto Recipe =
7717               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7718         RecipeBuilder.setRecipe(Instr, Recipe);
7719         VPBB->appendRecipe(Recipe);
7720         continue;
7721       }
7722 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a new successor for VPBB.
7725       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7726           Instr, Range, VPBB, PredInst2Recipe, Plan);
7727       if (NextVPBB != VPBB) {
7728         VPBB = NextVPBB;
7729         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7730                                     : "");
7731       }
7732     }
7733   }
7734 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks that produced no recipes.
7738   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7739   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7740   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7741   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7742   delete PreEntry;
7743 
7744   // ---------------------------------------------------------------------------
7745   // Transform initial VPlan: Apply previously taken decisions, in order, to
7746   // bring the VPlan to its final state.
7747   // ---------------------------------------------------------------------------
7748 
7749   // Apply Sink-After legal constraints.
7750   for (auto &Entry : SinkAfter) {
7751     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7752     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7753     Sink->moveAfter(Target);
7754   }
7755 
7756   // Interleave memory: for each Interleave Group we marked earlier as relevant
7757   // for this VPlan, replace the Recipes widening its memory instructions with a
7758   // single VPInterleaveRecipe at its insertion point.
7759   for (auto IG : InterleaveGroups) {
7760     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7761         RecipeBuilder.getRecipe(IG->getInsertPos()));
7762     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7763         ->insertBefore(Recipe);
7764 
7765     for (unsigned i = 0; i < IG->getFactor(); ++i)
7766       if (Instruction *Member = IG->getMember(i)) {
7767         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7768       }
7769   }
7770 
7771   // Adjust the recipes for any inloop reductions.
7772   if (Range.Start > 1)
7773     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
7774 
7775   // Finally, if tail is folded by masking, introduce selects between the phi
7776   // and the live-out instruction of each reduction, at the end of the latch.
7777   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
7778     Builder.setInsertPoint(VPBB);
7779     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7780     for (auto &Reduction : Legal->getReductionVars()) {
7781       assert(!CM.isInLoopReduction(Reduction.first) &&
7782              "Didn't expect inloop tail folded reduction yet!");
7783       VPValue *Phi = Plan->getVPValue(Reduction.first);
7784       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7785       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7786     }
7787   }
7788 
7789   std::string PlanName;
7790   raw_string_ostream RSO(PlanName);
7791   ElementCount VF = ElementCount::getFixed(Range.Start);
7792   Plan->addVF(VF);
7793   RSO << "Initial VPlan for VF={" << VF;
7794   for (VF *= 2; VF.getKnownMinValue() < Range.End; VF *= 2) {
7795     Plan->addVF(VF);
7796     RSO << "," << VF;
7797   }
7798   RSO << "},UF>=1";
7799   RSO.flush();
7800   Plan->setName(PlanName);
7801 
7802   return Plan;
7803 }
7804 
7805 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
7810   assert(!OrigLoop->empty());
7811   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7812 
7813   // Create new empty VPlan
7814   auto Plan = std::make_unique<VPlan>();
7815 
7816   // Build hierarchical CFG
7817   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7818   HCFGBuilder.buildHierarchicalCFG();
7819 
7820   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7821     Plan->addVF(ElementCount::getFixed(VF));
7822 
7823   if (EnableVPlanPredication) {
7824     VPlanPredicator VPP(*Plan);
7825     VPP.predicate();
7826 
7827     // Avoid running transformation to recipes until masked code generation in
7828     // VPlan-native path is in place.
7829     return Plan;
7830   }
7831 
7832   SmallPtrSet<Instruction *, 1> DeadInstructions;
7833   VPlanTransforms::VPInstructionsToVPRecipes(
7834       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7835   return Plan;
7836 }
7837 
7838 // Adjust the recipes for any inloop reductions. The chain of instructions
7839 // leading from the loop exit instr to the phi need to be converted to
7840 // reductions, with one operand being vector and the other being the scalar
7841 // reduction chain.
7842 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
7843     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
7844   for (auto &Reduction : CM.getInLoopReductionChains()) {
7845     PHINode *Phi = Reduction.first;
7846     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
7847     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7848 
    // ReductionOperations are ordered top-down, from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands remains scalar and which will be reduced.
    // For min/max the chain is the select instruction.
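    // For example (illustrative source), for an in-loop add reduction
    // 'sum += a[i]' the chain visits just the add, while for
    // 'm = max(m, a[i])' it visits the select, whose feeding icmp was
    // recorded earlier so it can be erased below.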
7853     Instruction *Chain = Phi;
7854     for (Instruction *R : ReductionOperations) {
7855       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
7856       RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
7857 
7858       VPValue *ChainOp = Plan->getVPValue(Chain);
7859       unsigned FirstOpId;
7860       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7861           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7862         assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC &&
7863                "Expected to replace a VPWidenSelectSC");
7864         FirstOpId = 1;
7865       } else {
7866         assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC &&
7867                "Expected to replace a VPWidenSC");
7868         FirstOpId = 0;
7869       }
7870       unsigned VecOpId =
7871           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
7872       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
7873 
7874       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
7875           &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr(), TTI);
7876       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
7877       WidenRecipe->eraseFromParent();
7878 
7879       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7880           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7881         VPRecipeBase *CompareRecipe =
7882             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
7883         assert(CompareRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC &&
7884                "Expected to replace a VPWidenSC");
7885         CompareRecipe->eraseFromParent();
7886       }
7887       Chain = R;
7888     }
7889   }
7890 }
7891 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7896 
7897 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7898     Value *V, const VPIteration &Instance) {
7899   return ILV.getOrCreateScalarValue(V, Instance);
7900 }
7901 
7902 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7903                                VPSlotTracker &SlotTracker) const {
7904   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7905   IG->getInsertPos()->printAsOperand(O, false);
7906   O << ", ";
7907   getAddr()->printAsOperand(O, SlotTracker);
7908   VPValue *Mask = getMask();
7909   if (Mask) {
7910     O << ", ";
7911     Mask->printAsOperand(O, SlotTracker);
7912   }
7913   for (unsigned i = 0; i < IG->getFactor(); ++i)
7914     if (Instruction *I = IG->getMember(i))
7915       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
7916 }
7917 
7918 void VPWidenCallRecipe::execute(VPTransformState &State) {
7919   State.ILV->widenCallInstruction(Ingredient, User, State);
7920 }
7921 
7922 void VPWidenSelectRecipe::execute(VPTransformState &State) {
7923   State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State);
7924 }
7925 
7926 void VPWidenRecipe::execute(VPTransformState &State) {
7927   State.ILV->widenInstruction(Ingredient, User, State);
7928 }
7929 
7930 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7931   State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant,
7932                       IsIndexLoopInvariant, State);
7933 }
7934 
7935 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7936   assert(!State.Instance && "Int or FP induction being replicated.");
7937   State.ILV->widenIntOrFpInduction(IV, Trunc);
7938 }
7939 
7940 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7941   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7942 }
7943 
7944 void VPBlendRecipe::execute(VPTransformState &State) {
7945   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7946   // We know that all PHIs in non-header blocks are converted into
7947   // selects, so we don't have to worry about the insertion order and we
7948   // can just use the builder.
7949   // At this point we generate the predication tree. There may be
7950   // duplications since this is a simple recursive scan, but future
7951   // optimizations will clean it up.
7952 
7953   unsigned NumIncoming = getNumIncomingValues();
7954 
7955   // Generate a sequence of selects of the form:
7956   // SELECT(Mask3, In3,
7957   //        SELECT(Mask2, In2,
7958   //               SELECT(Mask1, In1,
7959   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are essentially undef, are taken from In0.
7962   InnerLoopVectorizer::VectorParts Entry(State.UF);
7963   for (unsigned In = 0; In < NumIncoming; ++In) {
7964     for (unsigned Part = 0; Part < State.UF; ++Part) {
7965       // We might have single edge PHIs (blocks) - use an identity
7966       // 'select' for the first PHI operand.
7967       Value *In0 = State.get(getIncomingValue(In), Part);
7968       if (In == 0)
7969         Entry[Part] = In0; // Initialize with the first incoming value.
7970       else {
7971         // Select between the current value and the previous incoming edge
7972         // based on the incoming mask.
7973         Value *Cond = State.get(getMask(In), Part);
7974         Entry[Part] =
7975             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7976       }
7977     }
7978   }
7979   for (unsigned Part = 0; Part < State.UF; ++Part)
7980     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7981 }
7982 
7983 void VPInterleaveRecipe::execute(VPTransformState &State) {
7984   assert(!State.Instance && "Interleave group being replicated.");
7985   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
7986 }
7987 
7988 void VPReductionRecipe::execute(VPTransformState &State) {
7989   assert(!State.Instance && "Reduction being replicated.");
7990   for (unsigned Part = 0; Part < State.UF; ++Part) {
7991     unsigned Kind = RdxDesc->getRecurrenceKind();
7992     Value *NewVecOp = State.get(VecOp, Part);
7993     Value *NewRed =
7994         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
7995     Value *PrevInChain = State.get(ChainOp, Part);
7996     Value *NextInChain;
7997     if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7998         Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7999       NextInChain =
8000           createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
8001                          NewRed, PrevInChain);
8002     } else {
8003       NextInChain = State.Builder.CreateBinOp(
8004           (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain);
8005     }
8006     State.ValueMap.setVectorValue(I, Part, NextInChain);
8007   }
8008 }
8009 
8010 void VPReplicateRecipe::execute(VPTransformState &State) {
8011   if (State.Instance) { // Generate a single instance.
8012     State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
8013                                     IsPredicated, State);
8014     // Insert scalar instance packing it into a vector.
8015     if (AlsoPack && State.VF.isVector()) {
8016       // If we're constructing lane 0, initialize to start from undef.
8017       if (State.Instance->Lane == 0) {
8018         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8019         Value *Undef =
8020             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
8021         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
8022       }
8023       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
8024     }
8025     return;
8026   }
8027 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
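  // A uniform instruction, e.g. an address computation that is identical for
  // all lanes, only needs lane 0 of each unrolled part.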
8031   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8032   for (unsigned Part = 0; Part < State.UF; ++Part)
8033     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8034       State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
8035                                       IsPredicated, State);
8036 }
8037 
8038 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8039   assert(State.Instance && "Branch on Mask works only on single instance.");
8040 
8041   unsigned Part = State.Instance->Part;
8042   unsigned Lane = State.Instance->Lane;
8043 
8044   Value *ConditionBit = nullptr;
8045   VPValue *BlockInMask = getMask();
8046   if (BlockInMask) {
8047     ConditionBit = State.get(BlockInMask, Part);
8048     if (ConditionBit->getType()->isVectorTy())
8049       ConditionBit = State.Builder.CreateExtractElement(
8050           ConditionBit, State.Builder.getInt32(Lane));
8051   } else // Block in mask is all-one.
8052     ConditionBit = State.Builder.getTrue();
8053 
8054   // Replace the temporary unreachable terminator with a new conditional branch,
8055   // whose two destinations will be set later when they are created.
8056   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8057   assert(isa<UnreachableInst>(CurrentTerminator) &&
8058          "Expected to replace unreachable terminator with conditional branch.");
8059   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8060   CondBr->setSuccessor(0, nullptr);
8061   ReplaceInstWithInst(CurrentTerminator, CondBr);
8062 }
8063 
8064 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8065   assert(State.Instance && "Predicated instruction PHI works per instance.");
8066   Instruction *ScalarPredInst = cast<Instruction>(
8067       State.ValueMap.getScalarValue(PredInst, *State.Instance));
8068   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8069   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8070   assert(PredicatingBB && "Predicated block has no single predecessor.");
8071 
8072   // By current pack/unpack logic we need to generate only a single phi node: if
8073   // a vector value for the predicated instruction exists at this point it means
8074   // the instruction has vector users only, and a phi for the vector value is
8075   // needed. In this case the recipe of the predicated instruction is marked to
8076   // also do that packing, thereby "hoisting" the insert-element sequence.
8077   // Otherwise, a phi node for the scalar value is needed.
8078   unsigned Part = State.Instance->Part;
8079   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8080     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8081     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8082     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8083     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8084     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8085     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8086   } else {
8087     Type *PredInstType = PredInst->getType();
8088     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8089     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8090     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8091     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8092   }
8093 }
8094 
8095 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8096   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
8097   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
8098                                         getMask());
8099 }
8100 
8101 // Determine how to lower the scalar epilogue, which depends on 1) optimising
8102 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8103 // predication, and 4) a TTI hook that analyses whether the loop is suitable
8104 // for predication.
8105 static ScalarEpilogueLowering getScalarEpilogueLowering(
8106     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8107     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8108     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8109     LoopVectorizationLegality &LVL) {
8110   // 1) OptSize takes precedence over all other options, i.e. if this is set,
8111   // don't look at hints or options, and don't request a scalar epilogue.
8112   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8113   // LoopAccessInfo (due to code dependency and not being able to reliably get
8114   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8115   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
8116   // versioning when the vectorization is forced, unlike hasOptSize. So revert
8117   // back to the old way and vectorize with versioning when forced. See D81345.)
8118   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8119                                                       PGSOQueryType::IRPass) &&
8120                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8121     return CM_ScalarEpilogueNotAllowedOptSize;
8122 
8123   bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() &&
8124                               !PreferPredicateOverEpilogue;
8125 
8126   // 2) Next, if disabling predication is requested on the command line, honour
8127   // this and request a scalar epilogue.
8128   if (PredicateOptDisabled)
8129     return CM_ScalarEpilogueAllowed;
8130 
  // 3) and 4) Check whether predication is requested on the command line or
  // with a loop hint, or whether the TTI hook indicates that predication is
  // profitable; if so, request predication.
8134   if (PreferPredicateOverEpilogue ||
8135       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
8136       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8137                                         LVL.getLAI()) &&
8138        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
8139     return CM_ScalarEpilogueNotNeededUsePredicate;
8140 
8141   return CM_ScalarEpilogueAllowed;
8142 }
8143 
8144 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which makes it possible to
// apply VPlan-to-VPlan transformations from the very beginning without
// modifying the input LLVM IR.
8148 static bool processLoopInVPlanNativePath(
8149     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8150     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8151     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8152     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8153     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8154 
8155   if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
8156     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8157     return false;
8158   }
8159   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8160   Function *F = L->getHeader()->getParent();
8161   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8162 
8163   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8164       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8165 
8166   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8167                                 &Hints, IAI);
8168   // Use the planner for outer loop vectorization.
8169   // TODO: CM is not used at this point inside the planner. Turn CM into an
8170   // optional argument if we don't need it in the future.
8171   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8172 
8173   // Get user vectorization factor.
8174   const unsigned UserVF = Hints.getWidth();
8175 
8176   // Plan how to best vectorize, return the best VF and its cost.
8177   const VectorizationFactor VF =
8178       LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF));
8179 
8180   // If we are stress testing VPlan builds, do not attempt to generate vector
8181   // code. Masked vector code generation support will follow soon.
8182   // Also, do not attempt to vectorize if no vector code will be produced.
8183   if (VPlanBuildStressTest || EnableVPlanPredication ||
8184       VectorizationFactor::Disabled() == VF)
8185     return false;
8186 
8187   LVP.setBestPlan(VF.Width, 1);
8188 
8189   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
8190                          &CM, BFI, PSI);
8191   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8192                     << L->getHeader()->getParent()->getName() << "\"\n");
8193   LVP.executePlan(LB, DT);
8194 
8195   // Mark the loop as already vectorized to avoid vectorizing again.
8196   Hints.setAlreadyVectorized();
8197 
8198   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8199   return true;
8200 }
8201 
8202 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8203     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8204                                !EnableLoopInterleaving),
8205       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8206                               !EnableLoopVectorization) {}
8207 
8208 bool LoopVectorizePass::processLoop(Loop *L) {
8209   assert((EnableVPlanNativePath || L->empty()) &&
8210          "VPlan-native path is not enabled. Only process inner loops.");
8211 
8212 #ifndef NDEBUG
8213   const std::string DebugLocStr = getDebugLocString(L);
8214 #endif /* NDEBUG */
8215 
8216   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
8217                     << L->getHeader()->getParent()->getName() << "\" from "
8218                     << DebugLocStr << "\n");
8219 
8220   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
8221 
8222   LLVM_DEBUG(
8223       dbgs() << "LV: Loop hints:"
8224              << " force="
8225              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8226                      ? "disabled"
8227                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8228                             ? "enabled"
8229                             : "?"))
8230              << " width=" << Hints.getWidth()
8231              << " unroll=" << Hints.getInterleave() << "\n");
8232 
8233   // Function containing loop
8234   Function *F = L->getHeader()->getParent();
8235 
8236   // Looking at the diagnostic output is the only way to determine if a loop
8237   // was vectorized (other than looking at the IR or machine code), so it
8238   // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose; they report vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
8243 
8244   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
8245     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
8246     return false;
8247   }
8248 
8249   PredicatedScalarEvolution PSE(*SE, *L);
8250 
8251   // Check if it is legal to vectorize the loop.
8252   LoopVectorizationRequirements Requirements(*ORE);
8253   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
8254                                 &Requirements, &Hints, DB, AC, BFI, PSI);
8255   if (!LVL.canVectorize(EnableVPlanNativePath)) {
8256     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
8257     Hints.emitRemarkWithHints();
8258     return false;
8259   }
8260 
8261   // Check the function attributes and profiles to find out if this function
8262   // should be optimized for size.
8263   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8264       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
8265 
8266   // Entrance to the VPlan-native vectorization path. Outer loops are processed
8267   // here. They may require CFG and instruction level transformations before
8268   // even evaluating whether vectorization is profitable. Since we cannot modify
8269   // the incoming IR, we need to build VPlan upfront in the vectorization
8270   // pipeline.
8271   if (!L->empty())
8272     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
8273                                         ORE, BFI, PSI, Hints);
8274 
8275   assert(L->empty() && "Inner loop expected.");
8276 
8277   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
8278   // count by optimizing for size, to minimize overheads.
8279   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
8280   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
8281     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
8282                       << "This loop is worth vectorizing only if no scalar "
8283                       << "iteration overheads are incurred.");
8284     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
8285       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
8286     else {
8287       LLVM_DEBUG(dbgs() << "\n");
8288       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
8289     }
8290   }
8291 
8292   // Check the function attributes to see if implicit floats are allowed.
8293   // FIXME: This check doesn't seem possibly correct -- what if the loop is
8294   // an integer loop and the vector instructions selected are purely integer
8295   // vector instructions?
8296   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
8297     reportVectorizationFailure(
8298         "Can't vectorize when the NoImplicitFloat attribute is used",
8299         "loop not vectorized due to NoImplicitFloat attribute",
8300         "NoImplicitFloat", ORE, L);
8301     Hints.emitRemarkWithHints();
8302     return false;
8303   }
8304 
8305   // Check if the target supports potentially unsafe FP vectorization.
8306   // FIXME: Add a check for the type of safety issue (denormal, signaling)
8307   // for the target we're vectorizing for, to make sure none of the
8308   // additional fp-math flags can help.
8309   if (Hints.isPotentiallyUnsafe() &&
8310       TTI->isFPVectorizationPotentiallyUnsafe()) {
8311     reportVectorizationFailure(
8312         "Potentially unsafe FP op prevents vectorization",
8313         "loop not vectorized due to unsafe FP support.",
8314         "UnsafeFP", ORE, L);
8315     Hints.emitRemarkWithHints();
8316     return false;
8317   }
8318 
8319   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
8320   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
8321 
8322   // If an override option has been passed in for interleaved accesses, use it.
8323   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
8324     UseInterleaved = EnableInterleavedMemAccesses;
8325 
8326   // Analyze interleaved memory accesses.
8327   if (UseInterleaved) {
8328     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
8329   }
8330 
8331   // Use the cost model.
8332   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
8333                                 F, &Hints, IAI);
8334   CM.collectValuesToIgnore();
8335 
8336   // Use the planner for vectorization.
8337   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
8338 
8339   // Get user vectorization factor and interleave count.
8340   unsigned UserVF = Hints.getWidth();
8341   unsigned UserIC = Hints.getInterleave();
8342 
8343   // Plan how to best vectorize, return the best VF and its cost.
8344   Optional<VectorizationFactor> MaybeVF =
8345       LVP.plan(ElementCount::getFixed(UserVF), UserIC);
8346 
8347   VectorizationFactor VF = VectorizationFactor::Disabled();
8348   unsigned IC = 1;
8349 
8350   if (MaybeVF) {
8351     VF = *MaybeVF;
8352     // Select the interleave count.
8353     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
8354   }
8355 
8356   // Identify the diagnostic messages that should be produced.
8357   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
8358   bool VectorizeLoop = true, InterleaveLoop = true;
8359   if (Requirements.doesNotMeet(F, L, Hints)) {
8360     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
8361                          "requirements.\n");
8362     Hints.emitRemarkWithHints();
8363     return false;
8364   }
8365 
8366   if (VF.Width == 1) {
8367     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
8368     VecDiagMsg = std::make_pair(
8369         "VectorizationNotBeneficial",
8370         "the cost-model indicates that vectorization is not beneficial");
8371     VectorizeLoop = false;
8372   }
8373 
8374   if (!MaybeVF && UserIC > 1) {
8375     // Tell the user interleaving was avoided up-front, despite being explicitly
8376     // requested.
8377     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
8378                          "interleaving should be avoided up front\n");
8379     IntDiagMsg = std::make_pair(
8380         "InterleavingAvoided",
8381         "Ignoring UserIC, because interleaving was avoided up front");
8382     InterleaveLoop = false;
8383   } else if (IC == 1 && UserIC <= 1) {
8384     // Tell the user interleaving is not beneficial.
8385     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
8386     IntDiagMsg = std::make_pair(
8387         "InterleavingNotBeneficial",
8388         "the cost-model indicates that interleaving is not beneficial");
8389     InterleaveLoop = false;
8390     if (UserIC == 1) {
8391       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
8392       IntDiagMsg.second +=
8393           " and is explicitly disabled or interleave count is set to 1";
8394     }
8395   } else if (IC > 1 && UserIC == 1) {
    // Tell the user that interleaving is beneficial, but it was explicitly
    // disabled.
8397     LLVM_DEBUG(
8398         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
8399     IntDiagMsg = std::make_pair(
8400         "InterleavingBeneficialButDisabled",
8401         "the cost-model indicates that interleaving is beneficial "
8402         "but is explicitly disabled or interleave count is set to 1");
8403     InterleaveLoop = false;
8404   }
8405 
8406   // Override IC if user provided an interleave count.
8407   IC = UserIC > 0 ? UserIC : IC;
8408 
8409   // Emit diagnostic messages, if any.
8410   const char *VAPassName = Hints.vectorizeAnalysisPassName();
8411   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
8413     ORE->emit([&]() {
8414       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
8415                                       L->getStartLoc(), L->getHeader())
8416              << VecDiagMsg.second;
8417     });
8418     ORE->emit([&]() {
8419       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
8420                                       L->getStartLoc(), L->getHeader())
8421              << IntDiagMsg.second;
8422     });
8423     return false;
8424   } else if (!VectorizeLoop && InterleaveLoop) {
8425     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8426     ORE->emit([&]() {
8427       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
8428                                         L->getStartLoc(), L->getHeader())
8429              << VecDiagMsg.second;
8430     });
8431   } else if (VectorizeLoop && !InterleaveLoop) {
8432     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8433                       << ") in " << DebugLocStr << '\n');
8434     ORE->emit([&]() {
8435       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
8436                                         L->getStartLoc(), L->getHeader())
8437              << IntDiagMsg.second;
8438     });
8439   } else if (VectorizeLoop && InterleaveLoop) {
8440     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8441                       << ") in " << DebugLocStr << '\n');
8442     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8443   }
8444 
8445   LVP.setBestPlan(VF.Width, IC);
8446 
8447   using namespace ore;
8448   bool DisableRuntimeUnroll = false;
8449   MDNode *OrigLoopID = L->getLoopID();
8450 
8451   if (!VectorizeLoop) {
8452     assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that vectorizing the loop is not profitable, then
    // interleave it.
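    // InnerLoopUnroller is an InnerLoopVectorizer specialized to VF = 1, so
    // executing the plan emits IC interleaved scalar copies of the loop body
    // instead of vector instructions.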
8455     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
8456                                BFI, PSI);
8457     LVP.executePlan(Unroller, DT);
8458 
8459     ORE->emit([&]() {
8460       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
8461                                 L->getHeader())
8462              << "interleaved loop (interleaved count: "
8463              << NV("InterleaveCount", IC) << ")";
8464     });
8465   } else {
    // If we decided that vectorizing the loop is profitable, then do it.
8467     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
8468                            &LVL, &CM, BFI, PSI);
8469     LVP.executePlan(LB, DT);
8470     ++LoopsVectorized;
8471 
    // Add metadata to disable runtime unrolling of the scalar remainder loop
    // when there are no runtime checks on strides and memory accesses. A
    // scalar loop that rarely runs is not worth unrolling.
8475     if (!LB.areSafetyChecksAdded())
8476       DisableRuntimeUnroll = true;
8477 
8478     // Report the vectorization decision.
8479     ORE->emit([&]() {
8480       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
8481                                 L->getHeader())
8482              << "vectorized loop (vectorization width: "
8483              << NV("VectorizationFactor", VF.Width)
8484              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
8485     });
8486   }
8487 
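  // If the original loop carried followup metadata for the epilogue
  // (llvm.loop.vectorize.followup_epilogue or followup_all), propagate it to
  // the remaining scalar loop; otherwise apply the default handling below.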
8488   Optional<MDNode *> RemainderLoopID =
8489       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
8490                                       LLVMLoopVectorizeFollowupEpilogue});
8491   if (RemainderLoopID.hasValue()) {
8492     L->setLoopID(RemainderLoopID.getValue());
8493   } else {
8494     if (DisableRuntimeUnroll)
8495       AddRuntimeUnrollDisableMetaData(L);
8496 
8497     // Mark the loop as already vectorized to avoid vectorizing again.
8498     Hints.setAlreadyVectorized();
8499   }
8500 
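  // In builds with assertions enabled, verify that the rewritten function is
  // still well-formed IR.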
8501   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8502   return true;
8503 }
8504 
8505 LoopVectorizeResult LoopVectorizePass::runImpl(
8506     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
8507     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
8508     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
8509     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
8510     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
8511   SE = &SE_;
8512   LI = &LI_;
8513   TTI = &TTI_;
8514   DT = &DT_;
8515   BFI = &BFI_;
8516   TLI = TLI_;
8517   AA = &AA_;
8518   AC = &AC_;
8519   GetLAA = &GetLAA_;
8520   DB = &DB_;
8521   ORE = &ORE_;
8522   PSI = PSI_;
8523 
8524   // Don't attempt if
8525   // 1. the target claims to have no vector registers, and
8526   // 2. interleaving won't help ILP.
8527   //
8528   // The second condition is necessary because, even if the target has no
8529   // vector registers, loop vectorization may still enable scalar
8530   // interleaving.
8531   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
8532       TTI->getMaxInterleaveFactor(1) < 2)
8533     return LoopVectorizeResult(false, false);
8534 
8535   bool Changed = false, CFGChanged = false;
8536 
8537   // The vectorizer requires loops to be in simplified form.
8538   // Since simplification may add new inner loops, it has to run before the
8539   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
8541   // vectorized.
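  // Note that LCSSA is not preserved here; it is formed below only for the
  // loops that are actually processed.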
8542   for (auto &L : *LI)
8543     Changed |= CFGChanged |=
8544         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
8545 
8546   // Build up a worklist of inner-loops to vectorize. This is necessary as
8547   // the act of vectorizing or partially unrolling a loop creates new loops
8548   // and can invalidate iterators across the loops.
8549   SmallVector<Loop *, 8> Worklist;
8550 
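  // collectSupportedLoops gathers the innermost loops and, when the
  // VPlan-native path is enabled, outer loops that carry explicit
  // vectorization hints.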
8551   for (Loop *L : *LI)
8552     collectSupportedLoops(*L, LI, ORE, Worklist);
8553 
8554   LoopsAnalyzed += Worklist.size();
8555 
8556   // Now walk the identified inner loops.
8557   while (!Worklist.empty()) {
8558     Loop *L = Worklist.pop_back_val();
8559 
8560     // For the inner loops we actually process, form LCSSA to simplify the
8561     // transform.
8562     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8563 
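    // processLoop runs the per-loop legality and cost analysis and, when a
    // transformation is chosen, performs it; its result feeds both Changed
    // and CFGChanged.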
8564     Changed |= CFGChanged |= processLoop(L);
8565   }
8566 
  // Report whether any loops were transformed and whether the CFG changed.
8568   return LoopVectorizeResult(Changed, CFGChanged);
8569 }
8570 
8571 PreservedAnalyses LoopVectorizePass::run(Function &F,
8572                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
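  // MemorySSA is only computed when MSSA-based loop dependencies are enabled;
  // it is handed to the loop analyses through LoopStandardAnalysisResults
  // below.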
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;
8586 
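  // LoopAccessInfo is computed lazily, per loop, through the inner loop
  // analysis manager; the callback below is passed to runImpl so the analysis
  // is only requested for loops that are actually considered for
  // vectorization.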
  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
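  // ProfileSummaryInfo is a module analysis, so only a cached result is
  // available here; PSI may be null if it has not been computed for the
  // module.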
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
8601 
  // We currently do not preserve LoopInfo/DominatorTree analyses with outer
  // loop vectorization. Until this is addressed, mark these analyses as
  // preserved only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for the VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
8615 }
8616