1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
/// Command-line override: -vectorizer-min-trip-count (default 16).
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));
180 
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,          // Don't tail-predicate; create a scalar epilogue.
    PredicateElseScalarEpilogue, // Prefer tail-folding; fall back to an epilogue.
    PredicateOrDontVectorize     // Prefer tail-folding; otherwise don't vectorize.
  };
}
193 
/// Selects the tail-folding/predication strategy (see the PreferPredicateTy
/// comment above). Defaults to always creating a scalar epilogue.
static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                         "scalar-epilogue",
                         "Don't tail-predicate loops, create scalar epilogue"),
              clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                         "predicate-else-scalar-epilogue",
                         "prefer tail-folding, create scalar epilogue if tail "
                         "folding fails."),
              clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                         "predicate-dont-vectorize",
                         "prefers tail-folding, don't attempt vectorization if "
                         "tail-folding fails.")));
211 
/// When true, pick the vectorization factor that maximizes bandwidth, based
/// on the smallest type found in the loop (per the option's description).
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

/// Master switch for vectorizing interleaved memory accesses (off by default).
static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
226 
227 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
228     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
229     cl::desc("We don't interleave loops with a estimated constant trip count "
230              "below this number"));
231 
// The "force-target-*" flags below override values normally obtained from
// TargetTransformInfo; 0 (the default) means "don't override". Per their
// descriptions they exist mainly to get consistent behavior in tests.
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

/// Loops whose total cost is at most this value are considered "small" by the
/// interleaving heuristic (default 20).
static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));
260 
/// Use block-frequency (PGO) information to be conservative in cold regions
/// and more aggressive in hot ones (on by default).
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

/// Register-pressure heuristic: count the induction variable only once when
/// computing interleave counts (on by default).
static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

/// Allow vectorization of conditional (if-predicated) stores (on by default).
static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

/// Cap on the interleave count used for a scalar reduction inside a nested
/// loop (default 2).
static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

/// Force in-loop (rather than after-loop) vector reductions, regardless of
/// what the target prefers.
static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the targets preference."));

/// Prefer predicating the reduction operation itself over emitting a select
/// after the loop.
static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));
301 
// NOTE: deliberately not file-static — this flag appears to be referenced
// from other files (no 'static' and no llvm:: qualifier; verify against the
// extern declaration site).
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

// Master switches for interleaving and vectorization; defined with external
// linkage in the llvm namespace so other passes/drivers can consult them.
cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));
331 
332 /// A helper function that returns the type of loaded or stored value.
333 static Type *getMemInstValueType(Value *I) {
334   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
335          "Expected Load or Store instruction");
336   if (auto *LI = dyn_cast<LoadInst>(I))
337     return LI->getType();
338   return cast<StoreInst>(I)->getValueOperand()->getType();
339 }
340 
341 /// A helper function that returns true if the given type is irregular. The
342 /// type is irregular if its allocated size doesn't equal the store size of an
343 /// element of the corresponding vector type at the given vectorization factor.
344 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
345   assert(!VF.Scalable && "scalable vectors not yet supported.");
346   // Determine if an array of VF elements of type Ty is "bitcast compatible"
347   // with a <VF x Ty> vector.
348   if (VF.isVector()) {
349     auto *VectorTy = VectorType::get(Ty, VF);
350     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
351   }
352 
353   // If the vectorization factor is one, we just check if an array of type Ty
354   // requires padding between elements.
355   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
356 }
357 
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
///       we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() {
  // A predicated block is assumed to run on 1 of every 2 header iterations.
  constexpr unsigned ReciprocalProb = 2;
  return ReciprocalProb;
}
365 
366 /// A helper function that adds a 'fast' flag to floating-point operations.
367 static Value *addFastMathFlag(Value *V) {
368   if (isa<FPMathOperator>(V))
369     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
370   return V;
371 }
372 
373 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
374   if (isa<FPMathOperator>(V))
375     cast<Instruction>(V)->setFastMathFlags(FMF);
376   return V;
377 }
378 
379 /// A helper function that returns an integer or floating-point constant with
380 /// value C.
381 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
382   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
383                            : ConstantFP::get(Ty, C);
384 }
385 
386 /// Returns "best known" trip count for the specified loop \p L as defined by
387 /// the following procedure:
388 ///   1) Returns exact trip count if it is known.
389 ///   2) Returns expected trip count according to profile data if any.
390 ///   3) Returns upper bound estimate if it is known.
391 ///   4) Returns None if all of the above failed.
392 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
393   // Check if exact trip count is known.
394   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
395     return ExpectedTC;
396 
397   // Check if there is an expected trip count available from profile data.
398   if (LoopVectorizeWithBlockFrequency)
399     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
400       return EstimatedTC;
401 
402   // Check if upper bound estimate is known.
403   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
404     return ExpectedTC;
405 
406   return None;
407 }
408 
409 namespace llvm {
410 
411 /// InnerLoopVectorizer vectorizes loops which contain only one basic
412 /// block to a specified vectorization factor (VF).
413 /// This class performs the widening of scalars into vectors, or multiple
414 /// scalars. This class also implements the following features:
415 /// * It inserts an epilogue loop for handling loops that don't have iteration
416 ///   counts that are known to be a multiple of the vectorization factor.
417 /// * It handles the code generation for reduction variables.
418 /// * Scalarization (implementation using scalars) of un-vectorizable
419 ///   instructions.
420 /// InnerLoopVectorizer does not perform any vectorization-legality
421 /// checks, and relies on the caller to check for the different legality
422 /// aspects. The InnerLoopVectorizer relies on the
423 /// LoopVectorizationLegality class to provide information about the induction
424 /// and reduction variables that were found to a given vectorization factor.
425 class InnerLoopVectorizer {
426 public:
427   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
428                       LoopInfo *LI, DominatorTree *DT,
429                       const TargetLibraryInfo *TLI,
430                       const TargetTransformInfo *TTI, AssumptionCache *AC,
431                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
432                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
433                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
434                       ProfileSummaryInfo *PSI)
435       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
436         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
437         Builder(PSE.getSE()->getContext()),
438         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
439         BFI(BFI), PSI(PSI) {
440     // Query this against the original loop and save it here because the profile
441     // of the original loop header may change as the transformation happens.
442     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
443         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
444   }
445 
446   virtual ~InnerLoopVectorizer() = default;
447 
448   /// Create a new empty loop that will contain vectorized instructions later
449   /// on, while the old loop will be used as the scalar remainder. Control flow
450   /// is generated around the vectorized (and scalar epilogue) loops consisting
451   /// of various checks and bypasses. Return the pre-header block of the new
452   /// loop.
453   BasicBlock *createVectorizedLoopSkeleton();
454 
455   /// Widen a single instruction within the innermost loop.
456   void widenInstruction(Instruction &I, VPUser &Operands,
457                         VPTransformState &State);
458 
459   /// Widen a single call instruction within the innermost loop.
460   void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
461                             VPTransformState &State);
462 
463   /// Widen a single select instruction within the innermost loop.
464   void widenSelectInstruction(SelectInst &I, VPUser &Operands,
465                               bool InvariantCond, VPTransformState &State);
466 
467   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
468   void fixVectorizedLoop();
469 
470   // Return true if any runtime check is added.
471   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
472 
473   /// A type for vectorized values in the new loop. Each value from the
474   /// original loop, when vectorized, is represented by UF vector values in the
475   /// new unrolled loop, where UF is the unroll factor.
476   using VectorParts = SmallVector<Value *, 2>;
477 
478   /// Vectorize a single GetElementPtrInst based on information gathered and
479   /// decisions taken during planning.
480   void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
481                 ElementCount VF, bool IsPtrLoopInvariant,
482                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
483 
484   /// Vectorize a single PHINode in a block. This method handles the induction
485   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
486   /// arbitrary length vectors.
487   void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);
488 
489   /// A helper function to scalarize a single Instruction in the innermost loop.
490   /// Generates a sequence of scalar instances for each lane between \p MinLane
491   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
492   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
493   /// Instr's operands.
494   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
495                             const VPIteration &Instance, bool IfPredicateInstr,
496                             VPTransformState &State);
497 
498   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
499   /// is provided, the integer induction variable will first be truncated to
500   /// the corresponding type.
501   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
502 
503   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
504   /// vector or scalar value on-demand if one is not yet available. When
505   /// vectorizing a loop, we visit the definition of an instruction before its
506   /// uses. When visiting the definition, we either vectorize or scalarize the
507   /// instruction, creating an entry for it in the corresponding map. (In some
508   /// cases, such as induction variables, we will create both vector and scalar
509   /// entries.) Then, as we encounter uses of the definition, we derive values
510   /// for each scalar or vector use unless such a value is already available.
511   /// For example, if we scalarize a definition and one of its uses is vector,
512   /// we build the required vector on-demand with an insertelement sequence
513   /// when visiting the use. Otherwise, if the use is scalar, we can use the
514   /// existing scalar definition.
515   ///
516   /// Return a value in the new loop corresponding to \p V from the original
517   /// loop at unroll index \p Part. If the value has already been vectorized,
518   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
519   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
520   /// a new vector value on-demand by inserting the scalar values into a vector
521   /// with an insertelement sequence. If the value has been neither vectorized
522   /// nor scalarized, it must be loop invariant, so we simply broadcast the
523   /// value into a vector.
524   Value *getOrCreateVectorValue(Value *V, unsigned Part);
525 
526   /// Return a value in the new loop corresponding to \p V from the original
527   /// loop at unroll and vector indices \p Instance. If the value has been
528   /// vectorized but not scalarized, the necessary extractelement instruction
529   /// will be generated.
530   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
531 
532   /// Construct the vector value of a scalarized value \p V one lane at a time.
533   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
534 
535   /// Try to vectorize interleaved access group \p Group with the base address
536   /// given in \p Addr, optionally masking the vector operations if \p
537   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
538   /// values in the vectorized loop.
539   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
540                                 VPTransformState &State, VPValue *Addr,
541                                 VPValue *BlockInMask = nullptr);
542 
543   /// Vectorize Load and Store instructions with the base address given in \p
544   /// Addr, optionally masking the vector operations if \p BlockInMask is
545   /// non-null. Use \p State to translate given VPValues to IR values in the
546   /// vectorized loop.
547   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
548                                   VPValue *Addr, VPValue *StoredValue,
549                                   VPValue *BlockInMask);
550 
551   /// Set the debug location in the builder using the debug location in
552   /// the instruction.
553   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
554 
555   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
556   void fixNonInductionPHIs(void);
557 
558 protected:
559   friend class LoopVectorizationPlanner;
560 
561   /// A small list of PHINodes.
562   using PhiVector = SmallVector<PHINode *, 4>;
563 
564   /// A type for scalarized values in the new loop. Each value from the
565   /// original loop, when scalarized, is represented by UF x VF scalar values
566   /// in the new unrolled loop, where UF is the unroll factor and VF is the
567   /// vectorization factor.
568   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
569 
570   /// Set up the values of the IVs correctly when exiting the vector loop.
571   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
572                     Value *CountRoundDown, Value *EndValue,
573                     BasicBlock *MiddleBlock);
574 
575   /// Create a new induction variable inside L.
576   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
577                                    Value *Step, Instruction *DL);
578 
579   /// Handle all cross-iteration phis in the header.
580   void fixCrossIterationPHIs();
581 
582   /// Fix a first-order recurrence. This is the second phase of vectorizing
583   /// this phi node.
584   void fixFirstOrderRecurrence(PHINode *Phi);
585 
586   /// Fix a reduction cross-iteration phi. This is the second phase of
587   /// vectorizing this phi node.
588   void fixReduction(PHINode *Phi);
589 
590   /// Clear NSW/NUW flags from reduction instructions if necessary.
591   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
592 
593   /// The Loop exit block may have single value PHI nodes with some
594   /// incoming value. While vectorizing we only handled real values
595   /// that were defined inside the loop and we should have one value for
596   /// each predecessor of its parent basic block. See PR14725.
597   void fixLCSSAPHIs();
598 
599   /// Iteratively sink the scalarized operands of a predicated instruction into
600   /// the block that was created for it.
601   void sinkScalarOperands(Instruction *PredInst);
602 
603   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
604   /// represented as.
605   void truncateToMinimalBitwidths();
606 
607   /// Create a broadcast instruction. This method generates a broadcast
608   /// instruction (shuffle) for loop invariant values and for the induction
609   /// value. If this is the induction variable then we extend it to N, N+1, ...
610   /// this is needed because each iteration in the loop corresponds to a SIMD
611   /// element.
612   virtual Value *getBroadcastInstrs(Value *V);
613 
  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at \p StartIdx.
  /// \p Opcode is relevant for FP induction variables.
  virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                               Instruction::BinaryOps Opcode =
                               Instruction::BinaryOpsEnd);

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step, and
  /// \p EntryVal is the value from the original loop that maps to the steps.
  /// Note that \p EntryVal doesn't have to be an induction variable - it
  /// can also be a truncate instruction.
  void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
                        const InductionDescriptor &ID);

  /// Create a vector induction phi node based on an existing scalar one. \p
  /// EntryVal is the value from the original loop that maps to the vector phi
  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
  /// truncate instruction, instead of widening the original IV, we widen a
  /// version of the IV truncated to \p EntryVal's type.
  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
                                       Value *Step, Instruction *EntryVal);

  /// Returns true if an instruction \p I should be scalarized instead of
  /// vectorized for the chosen vectorization factor.
  bool shouldScalarizeInstruction(Instruction *I) const;

  /// Returns true if we should generate a scalar version of \p IV.
  bool needsScalarInduction(Instruction *IV) const;

  /// If there is a cast involved in the induction variable \p ID, which should
  /// be ignored in the vectorized loop body, this function records the
  /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
  /// cast. We had already proved that the casted Phi is equal to the uncasted
  /// Phi in the vectorized loop (under a runtime guard), and therefore
  /// there is no need to vectorize the cast - the same value can be used in the
  /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
  ///
  /// \p EntryVal is the value from the original loop that maps to the vector
  /// phi node and is used to distinguish what is the IV currently being
  /// processed - original one (if \p EntryVal is a phi corresponding to the
  /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In
  /// the latter case \p EntryVal is a TruncInst and we must not record anything
  /// for that IV, but it's error-prone to expect callers of this routine to
  /// care about that, hence this explicit parameter.
  void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
                                             const Instruction *EntryVal,
                                             Value *VectorLoopValue,
                                             unsigned Part,
                                             unsigned Lane = UINT_MAX);

  /// Generate a shuffle sequence that will reverse the vector Vec.
  virtual Value *reverseVector(Value *Vec);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(Loop *NewLoop);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(Loop *NewLoop);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct.
  void emitSCEVChecks(Loop *L, BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);

  /// Compute the transformed value of Index at offset StartValue using step
  /// StepValue.
  /// For integer induction, returns StartValue + Index * StepValue.
  /// For pointer induction, returns StartValue[Index * StepValue].
  /// FIXME: The newly created binary instructions should contain nsw/nuw
  /// flags, which can be found from the original scalar operations.
  Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
                              const DataLayout &DL,
                              const InductionDescriptor &ID) const;

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader. Also
  /// allocate a loop object for the new vector loop and return it.
  Loop *createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off (given by
  /// \p VectorTripCount).
  void createInductionResumeValues(Loop *L, Value *VectorTripCount);

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Take in the vector loop \p L as argument, and return
  /// the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks.  Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Add metadata from one instruction to another.
  ///
  /// This includes both the original MDs from \p From and additional ones (\see
  /// addNewMetadata).  Use this for *newly created* instructions in the vector
  /// loop.
  void addMetadata(Instruction *To, Instruction *From);

  /// Similar to the previous function but it adds the metadata to a
  /// vector of instructions.
  void addMetadata(ArrayRef<Value *> To, Instruction *From);
736 
  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning.  It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use to emit instructions.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The ExitBlock of the scalar loop.
  BasicBlock *LoopExitBlock;

  /// The vector loop body.
  BasicBlock *LoopVectorBody;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// The new Induction variable which was added to the new block.
  PHINode *Induction = nullptr;

  /// The induction variable of the old basic block.
  PHINode *OldInduction = nullptr;

  /// Maps values from the original loop to their corresponding values in the
  /// vectorized loop. A key value can map to either vector values, scalar
  /// values or both kinds of values, depending on whether the key was
  /// vectorized and scalarized.
  VectorizerValueMap VectorLoopValueMap;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;
};
853 
/// A specialization of InnerLoopVectorizer that interleaves (unrolls) the
/// loop without widening: it always runs with a fixed vectorization factor
/// of one and the given unroll factor.
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  // Helper overrides specialized for the VF == 1 case.
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};
875 
876 } // end namespace llvm
877 
878 /// Look for a meaningful debug location on the instruction or it's
879 /// operands.
880 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
881   if (!I)
882     return I;
883 
884   DebugLoc Empty;
885   if (I->getDebugLoc() != Empty)
886     return I;
887 
888   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
889     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
890       if (OpInst->getDebugLoc() != Empty)
891         return OpInst;
892   }
893 
894   return I;
895 }
896 
897 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
898   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
899     const DILocation *DIL = Inst->getDebugLoc();
900     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
901         !isa<DbgInfoIntrinsic>(Inst)) {
902       assert(!VF.Scalable && "scalable vectors not yet supported.");
903       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF.Min);
904       if (NewDIL)
905         B.SetCurrentDebugLocation(NewDIL.getValue());
906       else
907         LLVM_DEBUG(dbgs()
908                    << "Failed to create new discriminator: "
909                    << DIL->getFilename() << " Line: " << DIL->getLine());
910     }
911     else
912       B.SetCurrentDebugLocation(DIL);
913   } else
914     B.SetCurrentDebugLocation(DebugLoc());
915 }
916 
917 /// Write a record \p DebugMsg about vectorization failure to the debug
918 /// output stream. If \p I is passed, it is an instruction that prevents
919 /// vectorization.
920 #ifndef NDEBUG
921 static void debugVectorizationFailure(const StringRef DebugMsg,
922     Instruction *I) {
923   dbgs() << "LV: Not vectorizing: " << DebugMsg;
924   if (I != nullptr)
925     dbgs() << " " << *I;
926   else
927     dbgs() << '.';
928   dbgs() << '\n';
929 }
930 #endif
931 
932 /// Create an analysis remark that explains why vectorization failed
933 ///
934 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
935 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
936 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
937 /// the location of the remark.  \return the remark object that can be
938 /// streamed to.
939 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
940     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
941   Value *CodeRegion = TheLoop->getHeader();
942   DebugLoc DL = TheLoop->getStartLoc();
943 
944   if (I) {
945     CodeRegion = I->getParent();
946     // If there is no debug location attached to the instruction, revert back to
947     // using the loop's.
948     if (I->getDebugLoc())
949       DL = I->getDebugLoc();
950   }
951 
952   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
953   R << "loop not vectorized: ";
954   return R;
955 }
956 
957 namespace llvm {
958 
959 void reportVectorizationFailure(const StringRef DebugMsg,
960     const StringRef OREMsg, const StringRef ORETag,
961     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
962   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
963   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
964   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
965                 ORETag, TheLoop, I) << OREMsg);
966 }
967 
968 } // end namespace llvm
969 
970 #ifndef NDEBUG
971 /// \return string containing a file name and a line # for the given loop.
972 static std::string getDebugLocString(const Loop *L) {
973   std::string Result;
974   if (L) {
975     raw_string_ostream OS(Result);
976     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
977       LoopDbgLoc.print(OS);
978     else
979       // Just print the module name.
980       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
981     OS.flush();
982   }
983   return Result;
984 }
985 #endif
986 
987 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
988                                          const Instruction *Orig) {
989   // If the loop was versioned with memchecks, add the corresponding no-alias
990   // metadata.
991   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
992     LVer->annotateInstWithNoAlias(To, Orig);
993 }
994 
// Copy \p From's metadata onto \p To (propagateMetadata), plus any no-alias
// annotations introduced by memcheck-based versioning (addNewMetadata).
void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}
1000 
1001 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1002                                       Instruction *From) {
1003   for (Value *V : To) {
1004     if (Instruction *I = dyn_cast<Instruction>(V))
1005       addMetadata(I, From);
1006   }
1007 }
1008 
1009 namespace llvm {
1010 
// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate
};
1030 
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factor, or None if
  /// vectorization and interleaving should be avoided up front.
  Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
1066 
  /// Setup cost-based decisions for user vectorization factor.
  /// No VF search is performed; the uniformity/scalarization analyses are
  /// simply run for the user-provided \p UserVF.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }
1072 
  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }
1122 
1123   /// \returns True if it is more profitable to scalarize instruction \p I for
1124   /// vectorization factor \p VF.
1125   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1126     assert(VF.isVector() &&
1127            "Profitable to scalarize relevant only for VF > 1.");
1128 
1129     // Cost model is not run in the VPlan-native path - return conservative
1130     // result until this changes.
1131     if (EnableVPlanNativePath)
1132       return false;
1133 
1134     auto Scalars = InstsToScalarize.find(VF);
1135     assert(Scalars != InstsToScalarize.end() &&
1136            "VF not yet analyzed for scalarization profitability");
1137     return Scalars->second.find(I) != Scalars->second.end();
1138   }
1139 
1140   /// Returns true if \p I is known to be uniform after vectorization.
1141   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1142     if (VF.isScalar())
1143       return true;
1144 
1145     // Cost model is not run in the VPlan-native path - return conservative
1146     // result until this changes.
1147     if (EnableVPlanNativePath)
1148       return false;
1149 
1150     auto UniformsPerVF = Uniforms.find(VF);
1151     assert(UniformsPerVF != Uniforms.end() &&
1152            "VF not yet analyzed for uniformity");
1153     return UniformsPerVF->second.count(I);
1154   }
1155 
1156   /// Returns true if \p I is known to be scalar after vectorization.
1157   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1158     if (VF.isScalar())
1159       return true;
1160 
1161     // Cost model is not run in the VPlan-native path - return conservative
1162     // result until this changes.
1163     if (EnableVPlanNativePath)
1164       return false;
1165 
1166     auto ScalarsPerVF = Scalars.find(VF);
1167     assert(ScalarsPerVF != Scalars.end() &&
1168            "Scalar values are not calculated for VF");
1169     return ScalarsPerVF->second.count(I);
1170   }
1171 
1172   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1173   /// for vectorization factor \p VF.
1174   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1175     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1176            !isProfitableToScalarize(I, VF) &&
1177            !isScalarAfterVectorization(I, VF);
1178   }
1179 
  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,       // No decision was recorded for this instruction/VF.
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,    // Access handled via an interleaved-access group.
    CM_GatherScatter, // Access handled via a gather or scatter.
    CM_Scalarize      // Access replicated as scalar operations.
  };
1189 
1190   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1191   /// instruction \p I and vector width \p VF.
1192   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1193                            unsigned Cost) {
1194     assert(VF.isVector() && "Expected VF >=2");
1195     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1196   }
1197 
1198   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1199   /// interleaving group \p Grp and vector width \p VF.
1200   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1201                            ElementCount VF, InstWidening W, unsigned Cost) {
1202     assert(VF.isVector() && "Expected VF >=2");
1203     /// Broadcast this decicion to all instructions inside the group.
1204     /// But the cost will be assigned to one instruction only.
1205     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1206       if (auto *I = Grp->getMember(i)) {
1207         if (Grp->getInsertPos() == I)
1208           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1209         else
1210           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1211       }
1212     }
1213   }
1214 
1215   /// Return the cost model decision for the given instruction \p I and vector
1216   /// width \p VF. Return CM_Unknown if this instruction did not pass
1217   /// through the cost modeling.
1218   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1219     assert(!VF.Scalable && "scalable vectors not yet supported.");
1220     assert(VF.isVector() && "Expected VF >=2");
1221 
1222     // Cost model is not run in the VPlan-native path - return conservative
1223     // result until this changes.
1224     if (EnableVPlanNativePath)
1225       return CM_GatherScatter;
1226 
1227     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1228     auto Itr = WideningDecisions.find(InstOnVF);
1229     if (Itr == WideningDecisions.end())
1230       return CM_Unknown;
1231     return Itr->second.first;
1232   }
1233 
1234   /// Return the vectorization cost for the given instruction \p I and vector
1235   /// width \p VF.
1236   unsigned getWideningCost(Instruction *I, ElementCount VF) {
1237     assert(VF.isVector() && "Expected VF >=2");
1238     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1239     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1240            "The cost is not calculated");
1241     return WideningDecisions[InstOnVF].second;
1242   }
1243 
1244   /// Return True if instruction \p I is an optimizable truncate whose operand
1245   /// is an induction variable. Such a truncate will be removed by adding a new
1246   /// induction variable with the destination type.
1247   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1248     // If the instruction is not a truncate, return false.
1249     auto *Trunc = dyn_cast<TruncInst>(I);
1250     if (!Trunc)
1251       return false;
1252 
1253     // Get the source and destination types of the truncate.
1254     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1255     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1256 
1257     // If the truncate is free for the given types, return false. Replacing a
1258     // free truncate with an induction variable would add an induction variable
1259     // update instruction to each iteration of the loop. We exclude from this
1260     // check the primary induction variable since it will need an update
1261     // instruction regardless.
1262     Value *Op = Trunc->getOperand(0);
1263     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1264       return false;
1265 
1266     // If the truncated value is not an induction variable, return false.
1267     return Legal->isInductionPhi(Op);
1268   }
1269 
  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop, for vectorization factor \p VF.
  void collectInstsToScalarize(ElementCount VF);
1273 
1274   /// Collect Uniform and Scalar values for the given \p VF.
1275   /// The sets depend on CM decision for Load/Store instructions
1276   /// that may be vectorized as interleave, gather-scatter or scalarized.
1277   void collectUniformsAndScalars(ElementCount VF) {
1278     // Do the analysis once.
1279     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1280       return;
1281     setCostBasedWideningDecision(VF);
1282     collectLoopUniforms(VF);
1283     collectLoopScalars(VF);
1284   }
1285 
1286   /// Returns true if the target machine supports masked store operation
1287   /// for the given \p DataType and kind of access to \p Ptr.
1288   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1289     return Legal->isConsecutivePtr(Ptr) &&
1290            TTI.isLegalMaskedStore(DataType, Alignment);
1291   }
1292 
1293   /// Returns true if the target machine supports masked load operation
1294   /// for the given \p DataType and kind of access to \p Ptr.
1295   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1296     return Legal->isConsecutivePtr(Ptr) &&
1297            TTI.isLegalMaskedLoad(DataType, Alignment);
1298   }
1299 
  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType and \p Alignment.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType and \p Alignment.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }
1311 
1312   /// Returns true if the target machine can represent \p V as a masked gather
1313   /// or scatter operation.
1314   bool isLegalGatherOrScatter(Value *V) {
1315     bool LI = isa<LoadInst>(V);
1316     bool SI = isa<StoreInst>(V);
1317     if (!LI && !SI)
1318       return false;
1319     auto *Ty = getMemInstValueType(V);
1320     Align Align = getLoadStoreAlignment(V);
1321     return (LI && isLegalMaskedGather(Ty, Align)) ||
1322            (SI && isLegalMaskedScatter(Ty, Align));
1323   }
1324 
  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I,
                               ElementCount VF = ElementCount::getFixed(1));
1332 
1333   // Returns true if \p I is an instruction that will be predicated either
1334   // through scalar predication or masked load/store or masked gather/scatter.
1335   // Superset of instructions that return true for isScalarWithPredication.
1336   bool isPredicatedInst(Instruction *I) {
1337     if (!blockNeedsPredication(I->getParent()))
1338       return false;
1339     // Loads and stores that need some form of masked operation are predicated
1340     // instructions.
1341     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1342       return Legal->isMaskRequired(I);
1343     return isScalarWithPredication(I);
1344   }
1345 
  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }
1369 
1370   /// Returns true if an interleaved group requires a scalar iteration
1371   /// to handle accesses with gaps, and there is nothing preventing us from
1372   /// creating a scalar epilogue.
1373   bool requiresScalarEpilogue() const {
1374     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1375   }
1376 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation
  /// (ScalarEpilogueStatus is still CM_ScalarEpilogueAllowed).
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }
1382 
  /// Returns true if all loop blocks should be masked so the tail (remainder)
  /// iterations are folded into the vector loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }
1385 
1386   bool blockNeedsPredication(BasicBlock *BB) {
1387     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1388   }
1389 
1390   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1391   /// nodes to the chain of instructions representing the reductions. Uses a
1392   /// MapVector to ensure deterministic iteration order.
1393   using ReductionChainMap =
1394       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1395 
  /// Return the map from reduction phi nodes to the chains of instructions
  /// implementing the corresponding in-loop reductions.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }
1400 
1401   /// Returns true if the Phi is part of an inloop reduction.
1402   bool isInLoopReduction(PHINode *Phi) const {
1403     return InLoopReductionChains.count(Phi);
1404   }
1405 
1406   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1407   /// with factor VF.  Return the cost of the instruction, including
1408   /// scalarization overhead if it's needed.
1409   unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1410 
1411   /// Estimate cost of a call instruction CI if it were vectorized with factor
1412   /// VF. Return the cost of the instruction, including scalarization overhead
1413   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1414   /// scalarized -
1415   /// i.e. either vector version isn't available, or is too expensive.
1416   unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1417                              bool &NeedToScalarize);
1418 
1419   /// Invalidates decisions already taken by the cost model.
1420   void invalidateCostModelingDecisions() {
1421     WideningDecisions.clear();
1422     Uniforms.clear();
1423     Scalars.clear();
1424   }
1425 
1426 private:
1427   unsigned NumPredStores = 0;
1428 
1429   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1430   /// than zero. One is returned if vectorization should best be avoided due
1431   /// to cost.
1432   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1433 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1441   using VectorizationCostTy = std::pair<unsigned, bool>;
1442 
1443   /// Returns the expected execution cost. The unit of the cost does
1444   /// not matter because we use the 'cost' units to compare different
1445   /// vector widths. The cost that is returned is *not* normalized by
1446   /// the factor width.
1447   VectorizationCostTy expectedCost(ElementCount VF);
1448 
1449   /// Returns the execution time cost of an instruction for a given vector
1450   /// width. Vector width of one means scalar.
1451   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1452 
1453   /// The cost-computation logic from getInstructionCost which provides
1454   /// the vector type as an output parameter.
1455   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1456 
1457   /// Calculate vectorization cost of memory instruction \p I.
1458   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1459 
1460   /// The cost computation for scalarized memory instruction.
1461   unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1462 
1463   /// The cost computation for interleaving group of memory instructions.
1464   unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1465 
1466   /// The cost computation for Gather/Scatter instruction.
1467   unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1468 
1469   /// The cost computation for widening instruction \p I with consecutive
1470   /// memory access.
1471   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1472 
1473   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1474   /// Load: scalar load + broadcast.
1475   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1476   /// element)
1477   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1478 
1479   /// Estimate the overhead of scalarizing an instruction. This is a
1480   /// convenience wrapper for the type-based getScalarizationOverhead API.
1481   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1482 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1485   bool isConsecutiveLoadOrStore(Instruction *I);
1486 
1487   /// Returns true if an artificially high cost for emulated masked memrefs
1488   /// should be used.
1489   bool useEmulatedMaskMemRefHack(Instruction *I);
1490 
1491   /// Map of scalar integer values to the smallest bitwidth they can be legally
1492   /// represented as. The vector equivalents of these values should be truncated
1493   /// to this type.
1494   MapVector<Instruction *, uint64_t> MinBWs;
1495 
1496   /// A type representing the costs for instructions if they were to be
1497   /// scalarized rather than vectorized. The entries are Instruction-Cost
1498   /// pairs.
1499   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1500 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1503   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1504 
1505   /// Records whether it is allowed to have the original scalar loop execute at
1506   /// least once. This may be needed as a fallback loop in case runtime
1507   /// aliasing/dependence checks fail, or to handle the tail/remainder
1508   /// iterations when the trip count is unknown or doesn't divide by the VF,
1509   /// or as a peel-loop to handle gaps in interleave-groups.
1510   /// Under optsize and when the trip count is very small we don't allow any
1511   /// iterations to execute in the scalar loop.
1512   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1513 
1514   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1515   bool FoldTailByMasking = false;
1516 
1517   /// A map holding scalar costs for different vectorization factors. The
1518   /// presence of a cost for an instruction in the mapping indicates that the
1519   /// instruction will be scalarized when vectorizing with the associated
1520   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1521   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1522 
1523   /// Holds the instructions known to be uniform after vectorization.
1524   /// The data is collected per VF.
1525   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1526 
1527   /// Holds the instructions known to be scalar after vectorization.
1528   /// The data is collected per VF.
1529   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1530 
1531   /// Holds the instructions (address computations) that are forced to be
1532   /// scalarized.
1533   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1534 
1535   /// PHINodes of the reductions that should be expanded in-loop along with
1536   /// their associated chains of reduction operations, in program order from top
1537   /// (PHI) to bottom
1538   ReductionChainMap InLoopReductionChains;
1539 
1540   /// Returns the expected difference in cost from scalarizing the expression
1541   /// feeding a predicated instruction \p PredInst. The instructions to
1542   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1543   /// non-negative return value implies the expression will be scalarized.
1544   /// Currently, only single-use chains are considered for scalarization.
1545   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1546                               ElementCount VF);
1547 
1548   /// Collect the instructions that are uniform after vectorization. An
1549   /// instruction is uniform if we represent it with a single scalar value in
1550   /// the vectorized loop corresponding to each vector iteration. Examples of
1551   /// uniform instructions include pointer operands of consecutive or
1552   /// interleaved memory accesses. Note that although uniformity implies an
1553   /// instruction will be scalar, the reverse is not true. In general, a
1554   /// scalarized instruction will be represented by VF scalar values in the
1555   /// vectorized loop, each corresponding to an iteration of the original
1556   /// scalar loop.
1557   void collectLoopUniforms(ElementCount VF);
1558 
1559   /// Collect the instructions that are scalar after vectorization. An
1560   /// instruction is scalar if it is known to be uniform or will be scalarized
1561   /// during vectorization. Non-uniform scalarized instructions will be
1562   /// represented by VF values in the vectorized loop, each corresponding to an
1563   /// iteration of the original scalar loop.
1564   void collectLoopScalars(ElementCount VF);
1565 
1566   /// Keeps cost model vectorization decision and cost for instructions.
1567   /// Right now it is used for memory instructions only.
1568   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1569                                 std::pair<InstWidening, unsigned>>;
1570 
1571   DecisionList WideningDecisions;
1572 
1573   /// Returns true if \p V is expected to be vectorized and it needs to be
1574   /// extracted.
1575   bool needsExtract(Value *V, ElementCount VF) const {
1576     Instruction *I = dyn_cast<Instruction>(V);
1577     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1578         TheLoop->isLoopInvariant(I))
1579       return false;
1580 
1581     // Assume we can vectorize V (and hence we need extraction) if the
1582     // scalars are not computed yet. This can happen, because it is called
1583     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1584     // the scalars are collected. That should be a safe assumption in most
1585     // cases, because we check if the operands have vectorizable types
1586     // beforehand in LoopVectorizationLegality.
1587     return Scalars.find(VF) == Scalars.end() ||
1588            !isScalarAfterVectorization(I, VF);
1589   };
1590 
1591   /// Returns a range containing only operands needing to be extracted.
1592   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1593                                                    ElementCount VF) {
1594     return SmallVector<Value *, 4>(make_filter_range(
1595         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1596   }
1597 
1598 public:
1599   /// The loop that we evaluate.
1600   Loop *TheLoop;
1601 
1602   /// Predicated scalar evolution analysis.
1603   PredicatedScalarEvolution &PSE;
1604 
1605   /// Loop Info analysis.
1606   LoopInfo *LI;
1607 
1608   /// Vectorization legality.
1609   LoopVectorizationLegality *Legal;
1610 
1611   /// Vector target information.
1612   const TargetTransformInfo &TTI;
1613 
1614   /// Target Library Info.
1615   const TargetLibraryInfo *TLI;
1616 
1617   /// Demanded bits analysis.
1618   DemandedBits *DB;
1619 
1620   /// Assumption cache.
1621   AssumptionCache *AC;
1622 
1623   /// Interface to emit optimization remarks.
1624   OptimizationRemarkEmitter *ORE;
1625 
1626   const Function *TheFunction;
1627 
1628   /// Loop Vectorize Hint.
1629   const LoopVectorizeHints *Hints;
1630 
1631   /// The interleave access information contains groups of interleaved accesses
1632   /// with the same stride and close to each other.
1633   InterleavedAccessInfo &InterleaveInfo;
1634 
1635   /// Values to ignore in the cost model.
1636   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1637 
1638   /// Values to ignore in the cost model when VF > 1.
1639   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1640 };
1641 
1642 } // end namespace llvm
1643 
// Return true if \p OuterLp is an outer loop annotated with hints for
// explicit vectorization. The loop must be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#); if no
// vector length information is provided, vectorization is not considered
// explicit. Interleave hints are not allowed either. These limitations will
// be relaxed in the future.
// Note that we are currently forced to abuse the semantics of the pragma
// 'clang vectorize': that pragma provides *auto-vectorization hints* (i.e.,
// LV must check that vectorization is legal), whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legality checks and
// assume vectorization is legal). However, both hints are implemented using
// the same metadata (llvm.loop.vectorize, processed by LoopVectorizeHints).
// This will be fixed when a native IR representation for pragma 'omp simd'
// is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->empty() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Outer loops without an explicit vectorization hint are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  // The function-level and loop-level hints must permit vectorization.
  Function *ParentFn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(ParentFn, OuterLp,
                                true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  // Interleaving of outer loops is not supported.
  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}
1685 
1686 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1687                                   OptimizationRemarkEmitter *ORE,
1688                                   SmallVectorImpl<Loop *> &V) {
1689   // Collect inner loops and outer loops without irreducible control flow. For
1690   // now, only collect outer loops that have explicit vectorization hints. If we
1691   // are stress testing the VPlan H-CFG construction, we collect the outermost
1692   // loop of every loop nest.
1693   if (L.empty() || VPlanBuildStressTest ||
1694       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1695     LoopBlocksRPO RPOT(&L);
1696     RPOT.perform(LI);
1697     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1698       V.push_back(&L);
1699       // TODO: Collect inner loops inside marked outer loops in case
1700       // vectorization fails for the outer loop. Do not invoke
1701       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1702       // already known to be reducible. We can use an inherited attribute for
1703       // that.
1704       return;
1705     }
1706   }
1707   for (Loop *InnerL : L)
1708     collectSupportedLoops(*InnerL, LI, ORE, V);
1709 }
1710 
1711 namespace {
1712 
/// The LoopVectorize Pass.
///
/// Legacy pass-manager wrapper: collects the analyses the vectorizer needs
/// and forwards the actual work to the new-pass-manager implementation
/// (LoopVectorizePass) stored in \c Impl.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  /// The new-pass-manager implementation this legacy pass delegates to.
  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID),
        Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    // Gather all analyses required by the implementation.
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    // TLI is optional: only use it if the wrapper pass is available.
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    // LoopAccessInfo is computed lazily, per loop, through this callback.
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI).MadeAnyChange;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addRequired<InjectTLIMappingsLegacy>();

    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
};
1779 
1780 } // end anonymous namespace
1781 
1782 //===----------------------------------------------------------------------===//
1783 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1784 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1785 //===----------------------------------------------------------------------===//
1786 
1787 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1788   // We need to place the broadcast of invariant variables outside the loop,
1789   // but only if it's proven safe to do so. Else, broadcast will be inside
1790   // vector loop body.
1791   Instruction *Instr = dyn_cast<Instruction>(V);
1792   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1793                      (!Instr ||
1794                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1795   // Place the code for broadcasting invariant variables in the new preheader.
1796   IRBuilder<>::InsertPointGuard Guard(Builder);
1797   if (SafeToHoist)
1798     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1799 
1800   // Broadcast the scalar into all locations in the vector.
1801   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1802 
1803   return Shuf;
1804 }
1805 
// Widen an integer or floating-point induction described by \p II into a
// vector PHI in the vector loop. The PHI starts at <Start, Start+Step, ...>
// and each unrolled part advances the previous one by VF * Step; the last
// update feeds back into the PHI through the latch.
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");
  Value *Start = II.getStartValue();

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  // If widening a truncated use of the induction, narrow both the start
  // value and the step to the truncated type first.
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  Value *SteppedStart =
      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF.Min);
  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  assert(!VF.Scalable && "scalable vectors not yet supported.");
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(VF, Mul);
  // Return to the insert point that was active on entry: the PHI and its
  // updates below belong in the vector loop body, not the preheader.
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    // Each unrolled part's value is the previous part advanced by VF * Step.
    VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);

    LastInduction = cast<Instruction>(addFastMathFlag(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}
1883 
1884 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1885   return Cost->isScalarAfterVectorization(I, VF) ||
1886          Cost->isProfitableToScalarize(I, VF);
1887 }
1888 
1889 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1890   if (shouldScalarizeInstruction(IV))
1891     return true;
1892   auto isScalarInst = [&](User *U) -> bool {
1893     auto *I = cast<Instruction>(U);
1894     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1895   };
1896   return llvm::any_of(IV->users(), isScalarInst);
1897 }
1898 
// Record \p VectorLoopVal as the vector-loop value of the cast instruction
// attached to the induction described by \p ID, so that uses of the cast are
// mapped to the widened induction.
void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
    const InductionDescriptor &ID, const Instruction *EntryVal,
    Value *VectorLoopVal, unsigned Part, unsigned Lane) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // A truncate of the induction is a newly-created IV based on the proof that
  // the casted phi equals the uncasted phi in the vectorized loop (possibly
  // under a runtime guard). It re-uses the same InductionDescriptor as the
  // original IV, so no recording is needed here - that happens when the
  // original IV is processed.
  if (isa<TruncInst>(EntryVal))
    return;

  const SmallVectorImpl<Instruction *> &CastInsts = ID.getCastInsts();
  if (CastInsts.empty())
    return;

  // Only the first cast instruction matters; the remaining casts (if any)
  // have no uses outside the induction update chain itself.
  Instruction *FirstCast = CastInsts.front();
  if (Lane == UINT_MAX)
    VectorLoopValueMap.setVectorValue(FirstCast, Part, VectorLoopVal);
  else
    VectorLoopValueMap.setScalarValue(FirstCast, {Part, Lane}, VectorLoopVal);
}
1926 
// Widen the integer/floating-point induction \p IV (or its truncated use
// \p Trunc, if non-null). Depending on how the induction's users will be
// vectorized, this produces a vector IV, per-lane scalar steps, a splat of
// the scalar IV, or a combination of those.
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");

  auto II = Legal->getInductionVars().find(IV);
  assert(II != Legal->getInductionVars().end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
  auto CreateStepValue = [&](const SCEV *Step) -> Value * {
    assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
           "Induction step should be loop invariant");
    if (PSE.getSE()->isSCEVable(IV->getType())) {
      // Expand the step expression in the preheader so it dominates the loop.
      SCEVExpander Exp(*PSE.getSE(), DL, "induction");
      return Exp.expandCodeFor(Step, Step->getType(),
                               LoopVectorPreHeader->getTerminator());
    }
    return cast<SCEVUnknown>(Step)->getValue();
  };

  // The scalar value to broadcast. This is derived from the canonical
  // induction variable. If a truncation type is given, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  auto CreateScalarIV = [&](Value *&Step) -> Value * {
    Value *ScalarIV = Induction;
    if (IV != OldInduction) {
      // Rebase the canonical induction onto this induction's start/step.
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      Step = Builder.CreateTrunc(Step, TruncType);
    }
    return ScalarIV;
  };

  // Create the vector values from the scalar IV, in the absence of creating a
  // vector IV.
  auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      assert(!VF.Scalable && "scalable vectors not yet supported.");
      // Each unrolled part is offset from the broadcast by VF.Min * Part.
      Value *EntryPart = getStepVector(Broadcasted, VF.Min * Part, Step,
                                       ID.getInductionOpcode());
      VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
    }
  };

  // Now do the actual transformations, and start with creating the step value.
  Value *Step = CreateStepValue(ID.getStep());
  // Scalar VF: no vector IV is possible; splat the scalar IV instead.
  if (VF.isZero() || VF.isScalar()) {
    Value *ScalarIV = CreateScalarIV(Step);
    CreateSplatIV(ScalarIV, Step);
    return;
  }

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = needsScalarInduction(EntryVal);
  if (!NeedsScalarIV) {
    createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
    return;
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (!shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
    Value *ScalarIV = CreateScalarIV(Step);
    // Create scalar steps that can be used by instructions we will later
    // scalarize. Note that the addition of the scalar steps will not increase
    // the number of instructions in the loop in the common case prior to
    // InstCombine. We will be trading one vector extract for each scalar step.
    buildScalarSteps(ScalarIV, Step, EntryVal, ID);
    return;
  }

  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorised IV. Except when we tail-fold, then the splat IV feeds the
  // predicate used by the masked loads/stores.
  Value *ScalarIV = CreateScalarIV(Step);
  if (!Cost->isScalarEpilogueAllowed())
    CreateSplatIV(ScalarIV, Step);
  buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}
2034 
2035 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2036                                           Instruction::BinaryOps BinOp) {
2037   // Create and check the types.
2038   auto *ValVTy = cast<VectorType>(Val->getType());
2039   int VLen = ValVTy->getNumElements();
2040 
2041   Type *STy = Val->getType()->getScalarType();
2042   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2043          "Induction Step must be an integer or FP");
2044   assert(Step->getType() == STy && "Step has wrong type");
2045 
2046   SmallVector<Constant *, 8> Indices;
2047 
2048   if (STy->isIntegerTy()) {
2049     // Create a vector of consecutive numbers from zero to VF.
2050     for (int i = 0; i < VLen; ++i)
2051       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2052 
2053     // Add the consecutive indices to the vector value.
2054     Constant *Cv = ConstantVector::get(Indices);
2055     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2056     Step = Builder.CreateVectorSplat(VLen, Step);
2057     assert(Step->getType() == Val->getType() && "Invalid step vec");
2058     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2059     // which can be found from the original scalar operations.
2060     Step = Builder.CreateMul(Cv, Step);
2061     return Builder.CreateAdd(Val, Step, "induction");
2062   }
2063 
2064   // Floating point induction.
2065   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2066          "Binary Opcode should be specified for FP induction");
2067   // Create a vector of consecutive numbers from zero to VF.
2068   for (int i = 0; i < VLen; ++i)
2069     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2070 
2071   // Add the consecutive indices to the vector value.
2072   Constant *Cv = ConstantVector::get(Indices);
2073 
2074   Step = Builder.CreateVectorSplat(VLen, Step);
2075 
2076   // Floating point operations had to be 'fast' to enable the induction.
2077   FastMathFlags Flags;
2078   Flags.setFast();
2079 
2080   Value *MulOp = Builder.CreateFMul(Cv, Step);
2081   if (isa<Instruction>(MulOp))
2082     // Have to check, MulOp may be a constant
2083     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2084 
2085   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2086   if (isa<Instruction>(BOp))
2087     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2088   return BOp;
2089 }
2090 
2091 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2092                                            Instruction *EntryVal,
2093                                            const InductionDescriptor &ID) {
2094   // We shouldn't have to build scalar steps if we aren't vectorizing.
2095   assert(VF.isVector() && "VF should be greater than one");
2096   assert(!VF.Scalable &&
2097          "the code below assumes a fixed number of elements at compile time");
2098   // Get the value type and ensure it and the step have the same integer type.
2099   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2100   assert(ScalarIVTy == Step->getType() &&
2101          "Val and Step should have the same type");
2102 
2103   // We build scalar steps for both integer and floating-point induction
2104   // variables. Here, we determine the kind of arithmetic we will perform.
2105   Instruction::BinaryOps AddOp;
2106   Instruction::BinaryOps MulOp;
2107   if (ScalarIVTy->isIntegerTy()) {
2108     AddOp = Instruction::Add;
2109     MulOp = Instruction::Mul;
2110   } else {
2111     AddOp = ID.getInductionOpcode();
2112     MulOp = Instruction::FMul;
2113   }
2114 
2115   // Determine the number of scalars we need to generate for each unroll
2116   // iteration. If EntryVal is uniform, we only need to generate the first
2117   // lane. Otherwise, we generate all VF values.
2118   unsigned Lanes =
2119       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2120           ? 1
2121           : VF.Min;
2122   // Compute the scalar steps and save the results in VectorLoopValueMap.
2123   for (unsigned Part = 0; Part < UF; ++Part) {
2124     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2125       auto *StartIdx =
2126           getSignedIntOrFpConstant(ScalarIVTy, VF.Min * Part + Lane);
2127       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2128       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2129       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2130       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2131     }
2132   }
2133 }
2134 
// Return the vector value of V for unroll iteration \p Part, creating it on
// demand: from the vector map if present, by packing/broadcasting previously
// scalarized values, or by broadcasting a loop-invariant/unknown scalar.
Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

  // If we have a stride that is replaced by one, do it here. Defer this for
  // the VPlan-native path until we start running Legal checks in that path.
  if (!EnableVPlanNativePath && Legal->hasStride(V))
    V = ConstantInt::get(V->getType(), 1);

  // If we have a vector mapped to this value, return it.
  if (VectorLoopValueMap.hasVectorValue(V, Part))
    return VectorLoopValueMap.getVectorValue(V, Part);

  // If the value has not been vectorized, check if it has been scalarized
  // instead. If it has been scalarized, and we actually need the value in
  // vector form, we will construct the vector values on demand.
  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
    Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});

    // If we've scalarized a value, that value should be an instruction.
    auto *I = cast<Instruction>(V);

    // If we aren't vectorizing, we can just copy the scalar map values over to
    // the vector map.
    if (VF == 1) {
      VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
      return ScalarValue;
    }

    // Get the last scalar instruction we generated for V and Part. If the value
    // is known to be uniform after vectorization, this corresponds to lane zero
    // of the Part unroll iteration. Otherwise, the last instruction is the one
    // we created for the last vector lane of the Part unroll iteration.
    assert(!VF.Scalable && "scalable vectors not yet supported.");
    unsigned LastLane =
        Cost->isUniformAfterVectorization(I, VF) ? 0 : VF.Min - 1;
    auto *LastInst = cast<Instruction>(
        VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));

    // Set the insert point after the last scalarized instruction. This ensures
    // the insertelement sequence will directly follow the scalar definitions.
    // The original insertion point is restored below once the vector value has
    // been constructed.
    auto OldIP = Builder.saveIP();
    auto NewIP = std::next(BasicBlock::iterator(LastInst));
    Builder.SetInsertPoint(&*NewIP);

    // However, if we are vectorizing, we need to construct the vector values.
    // If the value is known to be uniform after vectorization, we can just
    // broadcast the scalar value corresponding to lane zero for each unroll
    // iteration. Otherwise, we construct the vector values using insertelement
    // instructions. Since the resulting vectors are stored in
    // VectorLoopValueMap, we will only generate the insertelements once.
    Value *VectorValue = nullptr;
    if (Cost->isUniformAfterVectorization(I, VF)) {
      VectorValue = getBroadcastInstrs(ScalarValue);
      VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
    } else {
      // Initialize packing with insertelements to start from undef.
      // The undef placeholder is registered first so packScalarIntoVectorValue
      // can read and update the map entry lane by lane.
      assert(!VF.Scalable && "VF is assumed to be non scalable.");
      Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
      VectorLoopValueMap.setVectorValue(V, Part, Undef);
      for (unsigned Lane = 0; Lane < VF.Min; ++Lane)
        packScalarIntoVectorValue(V, {Part, Lane});
      VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
    }
    Builder.restoreIP(OldIP);
    return VectorValue;
  }

  // If this scalar is unknown, assume that it is a constant or that it is
  // loop invariant. Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  VectorLoopValueMap.setVectorValue(V, Part, B);
  return B;
}
2210 
2211 Value *
2212 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2213                                             const VPIteration &Instance) {
2214   // If the value is not an instruction contained in the loop, it should
2215   // already be scalar.
2216   if (OrigLoop->isLoopInvariant(V))
2217     return V;
2218 
2219   assert(Instance.Lane > 0
2220              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2221              : true && "Uniform values only have lane zero");
2222 
2223   // If the value from the original loop has not been vectorized, it is
2224   // represented by UF x VF scalar values in the new loop. Return the requested
2225   // scalar value.
2226   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2227     return VectorLoopValueMap.getScalarValue(V, Instance);
2228 
2229   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2230   // for the given unroll part. If this entry is not a vector type (i.e., the
2231   // vectorization factor is one), there is no need to generate an
2232   // extractelement instruction.
2233   auto *U = getOrCreateVectorValue(V, Instance.Part);
2234   if (!U->getType()->isVectorTy()) {
2235     assert(VF == 1 && "Value not scalarized has non-vector type");
2236     return U;
2237   }
2238 
2239   // Otherwise, the value from the original loop has been vectorized and is
2240   // represented by UF vector values. Extract and return the requested scalar
2241   // value from the appropriate vector lane.
2242   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2243 }
2244 
2245 void InnerLoopVectorizer::packScalarIntoVectorValue(
2246     Value *V, const VPIteration &Instance) {
2247   assert(V != Induction && "The new induction variable should not be used.");
2248   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2249   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2250 
2251   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2252   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2253   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2254                                             Builder.getInt32(Instance.Lane));
2255   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2256 }
2257 
2258 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2259   assert(Vec->getType()->isVectorTy() && "Invalid type");
2260   assert(!VF.Scalable && "Cannot reverse scalable vectors");
2261   SmallVector<int, 8> ShuffleMask;
2262   for (unsigned i = 0; i < VF.Min; ++i)
2263     ShuffleMask.push_back(VF.Min - i - 1);
2264 
2265   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2266                                      ShuffleMask, "reverse");
2267 }
2268 
2269 // Return whether we allow using masked interleave-groups (for dealing with
2270 // strided loads/stores that reside in predicated blocks, or for dealing
2271 // with gaps).
2272 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2273   // If an override option has been passed in for interleaved accesses, use it.
2274   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2275     return EnableMaskedInterleavedMemAccesses;
2276 
2277   return TTI.enableMaskedInterleavedAccessVectorization();
2278 }
2279 
// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ... // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
//   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
//   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
//
// \p Addr is the per-part address of the group's insert position; \p
// BlockInMask, if non-null, is the block predicate that must be replicated
// across the interleave factor to mask the wide memory operation.
void InnerLoopVectorizer::vectorizeInterleaveGroup(
    const InterleaveGroup<Instruction> *Group, VPTransformState &State,
    VPValue *Addr, VPValue *BlockInMask) {
  Instruction *Instr = Group->getInsertPos();
  const DataLayout &DL = Instr->getModule()->getDataLayout();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getMemInstValueType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.Scalable && "scalable vectors not yet supported.");
  // The wide type covers VF tuples of InterleaveFactor elements each.
  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  assert(!VF.Scalable &&
         "scalable vector reverse operation is not implemented");
  if (Group->isReverse())
    Index += (VF.Min - 1) * Group->getFactor();

  // Compute one adjusted, bitcast pointer per unroll part.
  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Addr, {Part, 0});
    setDebugLocFromInst(Builder, AddrPart);

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].

    // Preserve inbounds from the original GEP (if any) on the adjusted GEP.
    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

    // Cast to the vector pointer type.
    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
  }

  setDebugLocFromInst(Builder, Instr);
  Value *UndefVec = UndefValue::get(VecTy);

  // When the group has gaps and a scalar epilogue is not allowed, build a mask
  // that disables the lanes belonging to the gap members.
  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    assert(!VF.Scalable && "scalable vectors not yet supported.");
    MaskForGaps = createBitMaskForGaps(Builder, VF.Min, *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (BlockInMask || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        // Combine the replicated block mask with the gap mask (when both are
        // present) to form the mask of the wide masked load.
        Value *GroupMask = MaskForGaps;
        if (BlockInMask) {
          Value *BlockInMaskPart = State.get(BlockInMask, Part);
          auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
          assert(!VF.Scalable && "scalable vectors not yet supported.");
          Value *ShuffledMask = Builder.CreateShuffleVector(
              BlockInMaskPart, Undefs,
              createReplicatedMask(InterleaveFactor, VF.Min),
              "interleaved.mask");
          GroupMask = MaskForGaps
                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                                MaskForGaps)
                          : ShuffledMask;
        }
        NewLoad =
            Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
                                     GroupMask, UndefVec, "wide.masked.vec");
      }
      else
        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
                                            Group->getAlign(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      assert(!VF.Scalable && "scalable vectors not yet supported.");
      auto StrideMask = createStrideMask(I, InterleaveFactor, VF.Min);
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], UndefVec, StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          assert(!VF.Scalable && "VF is assumed to be non scalable.");
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = reverseVector(StridedVec);

        VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
      }
    }
    return;
  }

  // The sub vector type for current instruction.
  assert(!VF.Scalable && "VF is assumed to be non scalable.");
  auto *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group.
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      // Interleaved store group doesn't allow a gap, so each index has a member
      Instruction *Member = Group->getMember(i);
      assert(Member && "Fail to get a member from an interleaved store group");

      Value *StoredVec = getOrCreateVectorValue(
          cast<StoreInst>(Member)->getValueOperand(), Part);
      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has different type, cast it to a unified type.

      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = concatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    assert(!VF.Scalable && "scalable vectors not yet supported.");
    Value *IVec = Builder.CreateShuffleVector(
        WideVec, UndefVec, createInterleaveMask(VF.Min, InterleaveFactor),
        "interleaved.vec");

    Instruction *NewStoreInstr;
    if (BlockInMask) {
      // Replicate the block mask across the interleave factor for the wide
      // masked store.
      Value *BlockInMaskPart = State.get(BlockInMask, Part);
      auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
      Value *ShuffledMask = Builder.CreateShuffleVector(
          BlockInMaskPart, Undefs,
          createReplicatedMask(InterleaveFactor, VF.Min), "interleaved.mask");
      NewStoreInstr = Builder.CreateMaskedStore(
          IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    }
    else
      NewStoreInstr =
          Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());

    Group->addMetadata(NewStoreInstr);
  }
}
2494 
// Widen a load or store according to the cost model's decision: a consecutive
// (possibly reversed) wide access, or a gather/scatter. \p StoredValue is the
// value operand for stores (null for loads); \p BlockInMask, if non-null,
// predicates the generated memory operations.
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
                                                     VPTransformState &State,
                                                     VPValue *Addr,
                                                     VPValue *StoredValue,
                                                     VPValue *BlockInMask) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");
  assert((!SI || StoredValue) && "No stored value provided for widened store");
  assert((!LI || !StoredValue) && "Stored value provided for widened load");

  LoopVectorizationCostModel::InstWidening Decision =
      Cost->getWideningDecision(Instr, VF);
  assert((Decision == LoopVectorizationCostModel::CM_Widen ||
          Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
          Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
         "CM decision is not to widen the memory instruction");

  Type *ScalarDataTy = getMemInstValueType(Instr);

  assert(!VF.Scalable && "scalable vectors not yet supported.");
  auto *DataTy = VectorType::get(ScalarDataTy, VF);
  const Align Alignment = getLoadStoreAlignment(Instr);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
  bool ConsecutiveStride =
      Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
  bool CreateGatherScatter =
      (Decision == LoopVectorizationCostModel::CM_GatherScatter);

  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
  // gather/scatter. Otherwise Decision should have been to Scalarize.
  assert((ConsecutiveStride || CreateGatherScatter) &&
         "The instruction should be scalarized");
  (void)ConsecutiveStride;

  // Materialize the per-part block masks up front if predication is needed.
  VectorParts BlockInMaskParts(UF);
  bool isMaskRequired = BlockInMask;
  if (isMaskRequired)
    for (unsigned Part = 0; Part < UF; ++Part)
      BlockInMaskParts[Part] = State.get(BlockInMask, Part);

  // Compute the (bitcast) pointer for the wide access of the given unroll
  // part. NOTE: for reversed accesses this also reverses the corresponding
  // entry of BlockInMaskParts as a side effect.
  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    // Propagate inbounds from the original GEP (if any) to the new GEPs.
    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
      InBounds = gep->isInBounds();

    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
          ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.Min)));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
          ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.Min)));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
    } else {
      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
          ScalarDataTy, Ptr, Builder.getInt32(Part * VF.Min)));
      PartPtr->setIsInBounds(InBounds);
    }

    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };

  // Handle Stores:
  if (SI) {
    setDebugLocFromInst(Builder, SI);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = State.get(StoredValue, Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
        Value *VectorGep = State.get(Addr, Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = reverseVector(StoredVal);
          // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            BlockInMaskParts[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
      Value *VectorGep = State.get(Addr, Part);
      NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      addMetadata(NewLI, LI);
    } else {
      auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(
            VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
            "wide.masked.load");
      else
        NewLI =
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = reverseVector(NewLI);
    }
    VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
  }
}
2631 
2632 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2633                                                const VPIteration &Instance,
2634                                                bool IfPredicateInstr,
2635                                                VPTransformState &State) {
2636   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2637 
2638   setDebugLocFromInst(Builder, Instr);
2639 
2640   // Does this instruction return a value ?
2641   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2642 
2643   Instruction *Cloned = Instr->clone();
2644   if (!IsVoidRetTy)
2645     Cloned->setName(Instr->getName() + ".cloned");
2646 
2647   // Replace the operands of the cloned instructions with their scalar
2648   // equivalents in the new loop.
2649   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2650     auto *NewOp = State.get(User.getOperand(op), Instance);
2651     Cloned->setOperand(op, NewOp);
2652   }
2653   addNewMetadata(Cloned, Instr);
2654 
2655   // Place the cloned scalar in the new loop.
2656   Builder.Insert(Cloned);
2657 
2658   // Add the cloned scalar to the scalar map entry.
2659   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2660 
2661   // If we just cloned a new assumption, add it the assumption cache.
2662   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2663     if (II->getIntrinsicID() == Intrinsic::assume)
2664       AC->registerAssumption(II);
2665 
2666   // End if-block.
2667   if (IfPredicateInstr)
2668     PredicatedInstructions.push_back(Cloned);
2669 }
2670 
/// Create the canonical scalar induction variable for loop \p L: a PHI node
/// in the loop header that starts at \p Start and is advanced by \p Step in
/// the latch. The latch's existing terminator is replaced with a conditional
/// branch that exits to L's exit block once the incremented value equals
/// \p End. Note that the \p DL parameter is not referenced here; debug
/// locations are instead taken from the primary induction (OldInduction).
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single block loop.
  if (!Latch)
    Latch = Header;

  // The PHI goes at the top of the header; the increment and exit compare go
  // at the end of the latch, before its terminator.
  IRBuilder<> Builder(&*Header->getFirstInsertionPt());
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(Builder, OldInst);
  auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");

  Builder.SetInsertPoint(Latch->getTerminator());
  setDebugLocFromInst(Builder, OldInst);

  // Create i+1 and fill the PHINode.
  Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(Next, End);
  Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}
2702 
/// Return the trip count of loop \p L (backedge-taken count + 1), expanding
/// the necessary instructions in the loop preheader on first use and caching
/// the result in the TripCount member.
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  // Reuse the cached value if it has already been expanded.
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
         "Invalid loop count");

  // The count is expressed in the widest induction type found by legality.
  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, DL, "induction");

  // Count holds the overall loop count (N).
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                L->getLoopPreheader()->getTerminator());

  // A pointer-typed count is cast to an integer of the widest induction type.
  if (TripCount->getType()->isPointerTy())
    TripCount =
        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
                                    L->getLoopPreheader()->getTerminator());

  return TripCount;
}
2749 
/// Return the number of scalar iterations the vector loop of \p L covers,
/// caching the result in VectorTripCount. This is the scalar trip count
/// rounded down to a multiple of VF * UF — rounded *up* first when the tail
/// is folded by masking, and reduced by a full step when a scalar epilogue
/// iteration is required and the step evenly divides the trip count.
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
  // Reuse the cached value if it has already been computed.
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getOrCreateTripCount(L);
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

  Type *Ty = TC->getType();
  // This is where we can make the step a runtime constant.
  assert(!VF.Scalable && "scalable vectorization is not supported yet");
  Constant *Step = ConstantInt::get(Ty, VF.Min * UF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF.Min * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");
    TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF.Min * UF - 1),
                           "n.rnd.up");
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  // If there is a non-reversed interleaved group that may speculatively access
  // memory out-of-bounds, we need to ensure that there will be at least one
  // iteration of the scalar epilogue loop. Thus, if the step evenly divides
  // the trip count, we set the remainder to be equal to the step. If the step
  // does not evenly divide the trip count, no adjustment is necessary since
  // there will already be scalar iterations. Note that the minimum iterations
  // check ensures that N >= Step.
  if (VF.isVector() && Cost->requiresScalarEpilogue()) {
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
    R = Builder.CreateSelect(IsZero, Step, R);
  }

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}
2798 
2799 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2800                                                    const DataLayout &DL) {
2801   // Verify that V is a vector type with same number of elements as DstVTy.
2802   assert(isa<FixedVectorType>(DstVTy) &&
2803          "Vector type is assumed to be fixed width.");
2804   unsigned VF = DstVTy->getNumElements();
2805   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2806   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2807   Type *SrcElemTy = SrcVecTy->getElementType();
2808   Type *DstElemTy = DstVTy->getElementType();
2809   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2810          "Vector elements must have same size");
2811 
2812   // Do a direct cast if element types are castable.
2813   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2814     return Builder.CreateBitOrPointerCast(V, DstVTy);
2815   }
2816   // V cannot be directly casted to desired vector type.
2817   // May happen when V is a floating point vector but DstVTy is a vector of
2818   // pointers or vice-versa. Handle this using a two-step bitcast using an
2819   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2820   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2821          "Only one type should be a pointer type");
2822   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2823          "Only one type should be a floating point type");
2824   Type *IntTy =
2825       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2826   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2827   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2828   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2829 }
2830 
/// Emit the minimum-iteration-count check for loop \p L: branch to \p Bypass
/// (the scalar loop) when the trip count is too small for even one vector
/// iteration. The current vector preheader becomes the check block and a new
/// "vector.ph" is split off after it; the dominator tree is updated
/// accordingly. When the tail is folded by masking, the check is a constant
/// false and the vector loop handles all iterations.
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                         BasicBlock *Bypass) {
  Value *Count = getOrCreateTripCount(L);
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
                                          : ICmpInst::ICMP_ULT;

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *CheckMinIters = Builder.getFalse();
  if (!Cost->foldTailByMasking()) {
    assert(!VF.Scalable && "scalable vectors not yet supported.");
    CheckMinIters = Builder.CreateICmp(
        P, Count, ConstantInt::get(Count->getType(), VF.Min * UF),
        "min.iters.check");
  }
  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
                 "vector.ph");

  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                               DT->getNode(Bypass)->getIDom()) &&
         "TC check is expected to dominate Bypass");

  // Update dominator for Bypass & LoopExit.
  DT->changeImmediateDominator(Bypass, TCCheckBlock);
  DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

  // Replace the unconditional branch left by SplitBlock with the conditional
  // bypass branch: true jumps to the scalar loop, false enters the vector
  // preheader.
  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
  LoopBypassBlocks.push_back(TCCheckBlock);
}
2873 
/// Emit the runtime checks for the SCEV predicates accumulated in PSE,
/// branching to \p Bypass (the scalar loop) when any assumption fails at
/// runtime. If the expanded check folds to a constant false (i.e. the
/// predicates trivially hold), nothing is emitted.
void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  // Reuse existing vector loop preheader for SCEV checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;

  // Generate the code to check that the SCEV assumptions that we made.
  // We want the new basic block to start at the first instruction in a
  // sequence of instructions that form a check.
  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
                   "scev.check");
  Value *SCEVCheck = Exp.expandCodeForPredicate(
      &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());

  // A constant-zero check can never fail; skip generating the bypass.
  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
    if (C->isZero())
      return;

  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
           (OptForSizeBasedOnProfile &&
            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
         "Cannot SCEV check stride or overflow when optimizing for size");

  SCEVCheckBlock->setName("vector.scevcheck");
  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
                 nullptr, "vector.ph");

  // Update dominator only if this is first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
  }

  // Replace the unconditional branch with the conditional bypass branch:
  // true (check failed) jumps to the scalar loop.
  ReplaceInstWithInst(
      SCEVCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
  LoopBypassBlocks.push_back(SCEVCheckBlock);
  AddedSafetyChecks = true;
}
2914 
/// Emit the runtime memory checks that verify the pointers accessed in the
/// loop do not overlap, branching to \p Bypass (the scalar loop) when they
/// may. Does nothing on the VPlan-native path or when no checks are needed.
/// Also sets up LoopVersioning so noalias metadata can be attached later.
void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return;

  // Reuse existing vector loop preheader for runtime memory checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const MemCheckBlock = L->getLoopPreheader();

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  auto *LAI = Legal->getLAI();
  const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
  if (!RtPtrChecking.Need)
    return;
  Instruction *FirstCheckInst;
  Instruction *MemRuntimeCheck;
  std::tie(FirstCheckInst, MemRuntimeCheck) =
      addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
                       RtPtrChecking.getChecks(), RtPtrChecking.getSE());
  assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
                            "claimed checks are required");

  // Emitting checks under opt-for-size is only allowed when vectorization was
  // explicitly forced; warn the user about the code-size cost.
  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
           "Cannot emit memory checks when optimizing for size, unless forced "
           "to vectorize.");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
                                        L->getStartLoc(), L->getHeader())
             << "Code-size may be reduced by not forcing "
                "vectorization, or by source-code modifications "
                "eliminating the need for runtime checks "
                "(e.g., adding 'restrict').";
    });
  }

  MemCheckBlock->setName("vector.memcheck");
  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
                 "vector.ph");

  // Update dominator only if this is first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, MemCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
  }

  // Replace the unconditional branch with the conditional bypass branch:
  // true (possible overlap) jumps to the scalar loop.
  ReplaceInstWithInst(
      MemCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
  LoopBypassBlocks.push_back(MemCheckBlock);
  AddedSafetyChecks = true;

  // We currently don't use LoopVersioning for the actual loop cloning but we
  // still use it to add the noalias metadata.
  LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
                                          PSE.getSE());
  LVer->prepareNoAliasMetadata();
}
2977 
/// Compute the value of the induction described by \p ID at the given
/// \p Index, i.e. Start (+/-) Index * Step with the operation appropriate to
/// the induction kind (integer add, pointer GEP, or the original FP binop),
/// emitting the instructions through \p B. Returns nullptr for
/// IK_NoInduction.
Value *InnerLoopVectorizer::emitTransformedIndex(
    IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
    const InductionDescriptor &ID) const {

  SCEVExpander Exp(*SE, DL, "induction");
  auto Step = ID.getStep();
  auto StartValue = ID.getStartValue();
  assert(Index->getType() == Step->getType() &&
         "Index type does not match StepValue type");

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  //
  // CreateAdd/CreateMul below fold the identity-operand cases (x+0, x*1)
  // manually, since SCEV simplification is unavailable here.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    return B.CreateMul(X, Y);
  };

  // Get a suitable insert point for SCEV expansion. For blocks in the vector
  // loop, choose the end of the vector loop header (=LoopVectorBody), because
  // the DomTree is not kept up-to-date for additional blocks generated in the
  // vector loop. By using the header as insertion point, we guarantee that the
  // expanded instructions dominate all their uses.
  auto GetInsertPoint = [this, &B]() {
    BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
    if (InsertBB != LoopVectorBody &&
        LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
      return LoopVectorBody->getTerminator();
    return &*B.GetInsertPoint();
  };
  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    // Special-case a step of -1: Start - Index, avoiding the multiply.
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    return B.CreateGEP(
        StartValue->getType()->getPointerElementType(), StartValue,
        CreateMul(Index,
                  Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();

    // Floating point operations had to be 'fast' to enable the induction.
    FastMathFlags Flags;
    Flags.setFast();

    Value *MulExp = B.CreateFMul(StepValue, Index);
    if (isa<Instruction>(MulExp))
      // We have to check, the MulExp may be a constant.
      cast<Instruction>(MulExp)->setFastMathFlags(Flags);

    // Replay the original FAdd/FSub against the scaled index.
    Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                               "induction");
    if (isa<Instruction>(BOp))
      cast<Instruction>(BOp)->setFastMathFlags(Flags);

    return BOp;
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
3077 
/// Split the original loop preheader to create the skeleton blocks of the
/// vector loop — vector preheader, (empty) vector body, middle block and
/// scalar preheader — each named with \p Prefix. Allocates and registers the
/// new vector Loop in LoopInfo and returns it.
Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  LoopExitBlock = OrigLoop->getExitBlock();
  assert(LoopExitBlock && "Must have an exit block");
  assert(LoopVectorPreHeader && "Invalid loop structure");

  // Split off the middle block and the scalar preheader in sequence, so the
  // chain becomes: vector.ph -> ... -> middle.block -> scalar.ph -> scalar
  // loop.
  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");
  // We intentionally don't let SplitBlock to update LoopInfo since
  // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place few lines later.
  LoopVectorBody =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 nullptr, nullptr, Twine(Prefix) + "vector.body");

  // Update dominator for loop exit.
  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
  return Lp;
}
3115 
/// For every induction variable of the original loop, create a
/// "bc.resume.val" PHI in the scalar preheader that selects the value the
/// scalar (remainder) loop should resume from: the induction's value after
/// \p VectorTripCount iterations when arriving from the middle block, or the
/// original start value when arriving via a bypass edge.
void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
                                                      Value *VectorTripCount) {
  assert(VectorTripCount && L && "Expected valid arguments");
  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.
  for (auto &InductionEntry : Legal->getInductionVars()) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the backedge-taken check block.
    PHINode *BCResumeVal =
        PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
                        LoopScalarPreHeader->getTerminator());
    // Copy original phi DL over to the new one.
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
    Value *&EndValue = IVEndValues[OrigPhi];
    if (OrigPhi == OldInduction) {
      // We know what the end value is: for the primary induction it is simply
      // the vector trip count.
      EndValue = VectorTripCount;
    } else {
      // For other inductions, compute the end value by transforming the
      // vector trip count (cast to the step's type) through the induction
      // expression.
      IRBuilder<> B(L->getLoopPreheader()->getTerminator());
      Type *StepType = II.getStep()->getType();
      Instruction::CastOps CastOp =
          CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
      Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
      const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
      EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
      EndValue->setName("ind.end");
    }

    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);

    // Fix the scalar body counter (PHI node).
    // The old induction's phi node in the scalar body needs the truncated
    // value.
    for (BasicBlock *BB : LoopBypassBlocks)
      BCResumeVal->addIncoming(II.getStartValue(), BB);
    OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
  }
}
3163 
/// Finish the vector loop skeleton for \p L: install the middle-block branch
/// that decides whether the scalar remainder loop must run, position the IR
/// builder at the start of the vector body, and attach loop metadata derived
/// from \p OrigLoopID. Returns the vector loop preheader.
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
                                                      MDNode *OrigLoopID) {
  assert(L && "Expected valid loop.");

  // The trip counts should be cached by now.
  Value *Count = getOrCreateTripCount(L);
  Value *VectorTripCount = getOrCreateVectorTripCount(L);

  // We need the OrigLoop (scalar loop part) latch terminator to help
  // produce correct debug info for the middle block BB instructions.
  // The legality check stage guarantees that the loop will have a single
  // latch.
  assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
         "Scalar loop latch terminator isn't a branch");
  BranchInst *ScalarLatchBr =
      cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
  // If tail is to be folded, we know we don't need to run the remainder.
  Value *CmpN = Builder.getTrue();
  if (!Cost->foldTailByMasking()) {
    CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
                           VectorTripCount, "cmp.n",
                           LoopMiddleBlock->getTerminator());

    // Here we use the same DebugLoc as the scalar loop latch branch instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. Eg. if the compare has got a line number inside the loop.
    cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
  }

  // True (all iterations done) exits the loop nest; false falls through to
  // the scalar remainder loop.
  BranchInst *BrInst =
      BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
  BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // Get ready to start creating new instructions into the vectorized body.
  assert(LoopVectorPreHeader == L->getLoopPreheader() &&
         "Inconsistent vector loop preheader");
  Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());

  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});
  if (VectorizedLoopID.hasValue()) {
    L->setLoopID(VectorizedLoopID.getValue());

    // Do not setAlreadyVectorized if loop attributes have been defined
    // explicitly.
    return LoopVectorPreHeader;
  }

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  if (MDNode *LID = OrigLoop->getLoopID())
    L->setLoopID(LID);

  LoopVectorizeHints Hints(L, true, *ORE);
  Hints.setAlreadyVectorized();

#ifdef EXPENSIVE_CHECKS
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
  LI->verify(*DT);
#endif

  return LoopVectorPreHeader;
}
3234 
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- loop iteration number check.
    /   |
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop.
  |     |
  |     v
  |   -[ ]   <--- middle-block.
  |  /  |
  | /   v
  -|- >[ ]     <--- new preheader.
   |    |
   |    v
   |   [ ] \
   |   [ ]_|   <-- old scalar loop to handle remainder.
    \   |
     \  v
      >[ ]     <-- exit block.
   ...
   */

  // Get the metadata of the original loop before it gets modified.
  MDNode *OrigLoopID = OrigLoop->getLoopID();

  // Create an empty vector loop, and prepare basic blocks for the runtime
  // checks.
  Loop *Lp = createVectorLoopSkeleton("");

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Lp, LoopScalarPreHeader);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible. However if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  assert(!VF.Scalable && "scalable vectors not yet supported.");
  Constant *Step = ConstantInt::get(IdxTy, VF.Min * UF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  // Create the canonical induction variable counting from 0 to the vector
  // trip count in steps of VF * UF.
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Emit phis for the new starting index of the scalar loop.
  createInductionResumeValues(Lp, CountRoundDown);

  return completeLoopSkeleton(Lp, OrigLoopID);
}
3319 
// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getExitBlock() && "Expected a single exit block");

  // Map from each external (LCSSA) PHI to the value it should receive when
  // control arrives from MiddleBlock.
  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value need to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      // Compute CRD - 1 in the step's type (sign-extend/truncate for integer
      // steps, convert to FP for floating point inductions), then evaluate
      // the induction expression at that index.
      IRBuilder<> B(MiddleBlock->getTerminator());
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each-other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}
3384 
namespace {

/// DenseMap traits enabling common-subexpression elimination over
/// instructions: two map keys hash and compare equal when the instructions
/// are structurally identical (same opcode and same operand values).
struct CSEDenseMapInfo {
  /// Restrict CSE to the instruction kinds handled below. These are the
  /// element-movement/address instructions the vectorizer tends to duplicate
  /// when widening (see cse(), which uses this trait).
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  /// Hash on the opcode plus all operand values, so structurally identical
  /// instructions land in the same bucket.
  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                           I->value_op_end()));
  }

  /// Sentinel keys (empty/tombstone) compare by pointer identity only;
  /// real keys compare structurally via isIdenticalTo().
  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace
3416 
3417 ///Perform cse of induction variable instructions.
3418 static void cse(BasicBlock *BB) {
3419   // Perform simple cse.
3420   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3421   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3422     Instruction *In = &*I++;
3423 
3424     if (!CSEDenseMapInfo::canHandle(In))
3425       continue;
3426 
3427     // Check if we can replace this instruction with any of the
3428     // visited instructions.
3429     if (Instruction *V = CSEMap.lookup(In)) {
3430       In->replaceAllUsesWith(V);
3431       In->eraseFromParent();
3432       continue;
3433     }
3434 
3435     CSEMap[In] = In;
3436   }
3437 }
3438 
3439 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3440                                                        ElementCount VF,
3441                                                        bool &NeedToScalarize) {
3442   assert(!VF.Scalable && "scalable vectors not yet supported.");
3443   Function *F = CI->getCalledFunction();
3444   Type *ScalarRetTy = CI->getType();
3445   SmallVector<Type *, 4> Tys, ScalarTys;
3446   for (auto &ArgOp : CI->arg_operands())
3447     ScalarTys.push_back(ArgOp->getType());
3448 
3449   // Estimate cost of scalarized vector call. The source operands are assumed
3450   // to be vectors, so we need to extract individual elements from there,
3451   // execute VF scalar calls, and then gather the result into the vector return
3452   // value.
3453   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3454                                                  TTI::TCK_RecipThroughput);
3455   if (VF.isScalar())
3456     return ScalarCallCost;
3457 
3458   // Compute corresponding vector type for return value and arguments.
3459   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3460   for (Type *ScalarTy : ScalarTys)
3461     Tys.push_back(ToVectorTy(ScalarTy, VF));
3462 
3463   // Compute costs of unpacking argument values for the scalar calls and
3464   // packing the return values to a vector.
3465   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3466 
3467   unsigned Cost = ScalarCallCost * VF.Min + ScalarizationCost;
3468 
3469   // If we can't emit a vector call for this function, then the currently found
3470   // cost is the cost we need to return.
3471   NeedToScalarize = true;
3472   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3473   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3474 
3475   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3476     return Cost;
3477 
3478   // If the corresponding vector cost is cheaper, return its cost.
3479   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3480                                                  TTI::TCK_RecipThroughput);
3481   if (VectorCallCost < Cost) {
3482     NeedToScalarize = false;
3483     return VectorCallCost;
3484   }
3485   return Cost;
3486 }
3487 
3488 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3489                                                             ElementCount VF) {
3490   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3491   assert(ID && "Expected intrinsic call!");
3492 
3493   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3494   return TTI.getIntrinsicInstrCost(CostAttrs,
3495                                    TargetTransformInfo::TCK_RecipThroughput);
3496 }
3497 
3498 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3499   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3500   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3501   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3502 }
3503 
3504 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3505   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3506   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3507   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3508 }
3509 
/// Shrink vectorized integer operations to the minimal bit widths computed by
/// the cost model, surrounding each shrunk operation with trunc/ext pairs
/// that InstCombine cleans up later.
void InnerLoopVectorizer::truncateToMinimalBitwidths() {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and reextend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
  SmallPtrSet<Value *, 4> Erased;
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from VectorLoopValueMap indicates that it
    // wasn't vectorized.
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    // Process each unrolled copy of the value independently.
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = getOrCreateVectorValue(KV.first, Part);
      // Skip values already erased by an earlier iteration, dead values, and
      // non-instructions (e.g. constants), which cannot be rewritten in place.
      if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
        continue;
      Type *OriginalTy = I->getType();
      // KV.second is the minimal bit width recorded for this value.
      Type *ScalarTruncatedTy =
          IntegerType::get(OriginalTy->getContext(), KV.second);
      auto *TruncatedTy = FixedVectorType::get(
          ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements());
      if (TruncatedTy == OriginalTy)
        continue;

      IRBuilder<> B(cast<Instruction>(I));
      // Narrow an operand to TruncatedTy; if the operand is a zext from
      // exactly that type, peel the zext instead of stacking a new cast.
      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);
      };

      // The actual instruction modification depends on the instruction type,
      // unfortunately.
      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));

        // Any wrapping introduced by shrinking this operation shouldn't be
        // considered undefined behavior. So, we can't unconditionally copy
        // arithmetic wrapping flags to NewI.
        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        NewI =
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        // Only the selected values are narrowed; the condition keeps its type.
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {
        default:
          llvm_unreachable("Unhandled cast!");
        case Instruction::Trunc:
          // A trunc to at least TruncatedTy is absorbed into the shrink.
          NewI = ShrinkOperand(CI->getOperand(0));
          break;
        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        }
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
        // Shuffle operands may have different element counts than the result,
        // so narrow each one at its own width.
        auto Elements0 =
            cast<VectorType>(SI->getOperand(0)->getType())->getNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            SI->getOperand(0),
            FixedVectorType::get(ScalarTruncatedTy, Elements0));
        auto Elements1 =
            cast<VectorType>(SI->getOperand(1)->getType())->getNumElements();
        auto *O1 = B.CreateZExtOrTrunc(
            SI->getOperand(1),
            FixedVectorType::get(ScalarTruncatedTy, Elements1));

        NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
        // Don't do anything with the operands, just extend the result.
        continue;
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        // Narrow the vector and the scalar element; the index is unchanged.
        auto Elements =
            cast<VectorType>(IE->getOperand(0)->getType())->getNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            IE->getOperand(0),
            FixedVectorType::get(ScalarTruncatedTy, Elements));
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        auto Elements =
            cast<VectorType>(EE->getOperand(0)->getType())->getNumElements();
        auto *O0 = B.CreateZExtOrTrunc(
            EE->getOperand(0),
            FixedVectorType::get(ScalarTruncatedTy, Elements));
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
      } else {
        // If we don't know what to do, be conservative and don't do anything.
        continue;
      }

      // Lastly, extend the result back to the original type and replace the
      // old instruction everywhere.
      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();
      Erased.insert(I);
      VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
    }
  }

  // We'll have created a bunch of ZExts that are now parentless. Clean up.
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from VectorLoopValueMap indicates that it
    // wasn't vectorized.
    if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      // Drop zexts with no users and record their operand as the vector value.
      Value *I = getOrCreateVectorValue(KV.first, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
      }
    }
  }
}
3642 
/// Run the post-widening fix-up passes over the vectorized loop, in the
/// required order: bit-width shrinking, PHI completion, IV/LCSSA repair,
/// scalar-operand sinking, CSE, and profile-weight updates.
void InnerLoopVectorizer::fixVectorizedLoop() {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF.isVector())
    truncateToMinimalBitwidths();

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs();
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs();

  // Forget the original basic block. SCEV info about the original loop is
  // stale after the transformation above.
  PSE.getSE()->forgetLoop(OrigLoop);

  // Fix-up external users of the induction variables.
  for (auto &Entry : Legal->getInductionVars())
    fixupIVUsers(Entry.first, Entry.second,
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                 IVEndValues[Entry.first], LoopMiddleBlock);

  fixLCSSAPHIs();
  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that original loop
  // represented by LoopScalarBody becomes remainder loop after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
  // end up getting slightly roughened result but that should be OK since
  // profile is not inherently precise anyway. Note also possible bypass of
  // vector code caused by legality checks is ignored, assigning all the weight
  // to the vector loop, optimistically.
  assert(!VF.Scalable &&
         "cannot use scalable ElementCount to determine unroll factor");
  setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
                               LI->getLoopFor(LoopVectorBody),
                               LI->getLoopFor(LoopScalarBody), VF.Min * UF);
}
3693 
3694 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3695   // In order to support recurrences we need to be able to vectorize Phi nodes.
3696   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3697   // stage #2: We now need to fix the recurrences by adding incoming edges to
3698   // the currently empty PHI nodes. At this point every instruction in the
3699   // original loop is widened to a vector form so we can use them to construct
3700   // the incoming edges.
3701   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3702     // Handle first-order recurrences and reductions that need to be fixed.
3703     if (Legal->isFirstOrderRecurrence(&Phi))
3704       fixFirstOrderRecurrence(&Phi);
3705     else if (Legal->isReductionVariable(&Phi))
3706       fixReduction(&Phi);
3707   }
3708 }
3709 
void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // temporary value for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After execution completes the vector loop, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Get the original loop preheader and single loop latch.
  auto *Preheader = OrigLoop->getLoopPreheader();
  auto *Latch = OrigLoop->getLoopLatch();

  // Get the initial and previous values of the scalar recurrence.
  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
  auto *Previous = Phi->getIncomingValueForBlock(Latch);

  // Create a vector from the initial value: the scalar init goes into the
  // last lane so the shuffle below rotates it into lane 0 of the first
  // iteration's recurrence vector.
  auto *VectorInit = ScalarInit;
  if (VF.isVector()) {
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
    assert(!VF.Scalable && "VF is assumed to be non scalable.");
    VectorInit = Builder.CreateInsertElement(
        UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
        Builder.getInt32(VF.Min - 1), "vector.recur.init");
  }

  // We constructed a temporary phi node in the first phase of vectorization.
  // This phi node will eventually be deleted.
  Builder.SetInsertPoint(
      cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));

  // Create a phi node for the new recurrence. The current value will either be
  // the initial value inserted into a vector or loop-varying vector value.
  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);

  // Get the vectorized previous value of the last part UF - 1. It appears last
  // among all unrolled iterations, due to the order of their construction.
  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);

  // Find and set the insertion point after the previous value if it is an
  // instruction.
  BasicBlock::iterator InsertPt;
  // Note that the previous value may have been constant-folded so it is not
  // guaranteed to be an instruction in the vector loop.
  // FIXME: Loop invariant values do not form recurrences. We should deal with
  //        them earlier.
  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
    InsertPt = LoopVectorBody->getFirstInsertionPt();
  else {
    Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
    if (isa<PHINode>(PreviousLastPart))
      // If the previous value is a phi node, we should insert after all the phi
      // nodes in the block containing the PHI to avoid breaking basic block
      // verification. Note that the basic block may be different to
      // LoopVectorBody, in case we predicate the loop.
      InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
    else
      InsertPt = ++PreviousInst->getIterator();
  }
  Builder.SetInsertPoint(&*InsertPt);

  // We will construct a vector for the recurrence by combining the values for
  // the current and previous iterations. This is the required shuffle mask:
  // lane 0 takes the last lane of the previous vector, lanes 1..VF-1 take
  // lanes 0..VF-2 of the current vector.
  assert(!VF.Scalable);
  SmallVector<int, 8> ShuffleMask(VF.Min);
  ShuffleMask[0] = VF.Min - 1;
  for (unsigned I = 1; I < VF.Min; ++I)
    ShuffleMask[I] = I + VF.Min - 1;

  // The vector from which to take the initial value for the current iteration
  // (actual or unrolled). Initially, this is the vector phi node.
  Value *Incoming = VecPhi;

  // Shuffle the current and previous vector and update the vector parts.
  // Each unrolled part's recurrence vector is the concatenation shuffle of
  // the previous part's values and this part's values.
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
    Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
    auto *Shuffle =
        VF.isVector()
            ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
            : Incoming;
    // Replace the temporary phi created in phase one with the shuffle.
    PhiPart->replaceAllUsesWith(Shuffle);
    cast<Instruction>(PhiPart)->eraseFromParent();
    VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
    Incoming = PreviousPart;
  }

  // Fix the latch value of the new recurrence in the vector loop.
  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  auto *ExtractForScalar = Incoming;
  if (VF.isVector()) {
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    ExtractForScalar = Builder.CreateExtractElement(
        ExtractForScalar, Builder.getInt32(VF.Min - 1), "vector.recur.extract");
  }
  // Extract the second last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF.isVector())
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Builder.getInt32(VF.Min - 2), "vector.recur.extract.for.phi");
  // When loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
  // `Incoming`. This is analogous to the vectorized case above: extracting the
  // second last element when VF > 1.
  else if (UF > 1)
    ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop: it
  // comes from the middle block when the vector loop ran, and from the
  // original init otherwise.
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getIncomingValue(0) == Phi) {
      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
    }
  }
}
3889 
3890 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3891   Constant *Zero = Builder.getInt32(0);
3892 
3893   // Get it's reduction variable descriptor.
3894   assert(Legal->isReductionVariable(Phi) &&
3895          "Unable to find the reduction variable");
3896   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3897 
3898   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3899   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3900   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3901   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3902     RdxDesc.getMinMaxRecurrenceKind();
3903   setDebugLocFromInst(Builder, ReductionStartValue);
3904   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3905 
3906   // We need to generate a reduction vector from the incoming scalar.
3907   // To do so, we need to generate the 'identity' vector and override
3908   // one of the elements with the incoming scalar reduction. We need
3909   // to do it in the vector-loop preheader.
3910   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3911 
3912   // This is the vector-clone of the value that leaves the loop.
3913   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3914 
3915   // Find the reduction identity variable. Zero for addition, or, xor,
3916   // one for multiplication, -1 for And.
3917   Value *Identity;
3918   Value *VectorStart;
3919   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3920       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3921     // MinMax reduction have the start value as their identify.
3922     if (VF == 1 || IsInLoopReductionPhi) {
3923       VectorStart = Identity = ReductionStartValue;
3924     } else {
3925       VectorStart = Identity =
3926         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3927     }
3928   } else {
3929     // Handle other reduction kinds:
3930     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3931         RK, VecTy->getScalarType());
3932     if (VF == 1 || IsInLoopReductionPhi) {
3933       Identity = Iden;
3934       // This vector is the Identity vector where the first element is the
3935       // incoming scalar reduction.
3936       VectorStart = ReductionStartValue;
3937     } else {
3938       Identity = ConstantVector::getSplat(VF, Iden);
3939 
3940       // This vector is the Identity vector where the first element is the
3941       // incoming scalar reduction.
3942       VectorStart =
3943         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3944     }
3945   }
3946 
3947   // Wrap flags are in general invalid after vectorization, clear them.
3948   clearReductionWrapFlags(RdxDesc);
3949 
3950   // Fix the vector-loop phi.
3951 
3952   // Reductions do not have to start at zero. They can start with
3953   // any loop invariant values.
3954   BasicBlock *Latch = OrigLoop->getLoopLatch();
3955   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3956 
3957   for (unsigned Part = 0; Part < UF; ++Part) {
3958     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3959     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3960     // Make sure to add the reduction start value only to the
3961     // first unroll part.
3962     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3963     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3964     cast<PHINode>(VecRdxPhi)
3965       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3966   }
3967 
3968   // Before each round, move the insertion point right between
3969   // the PHIs and the values we are going to write.
3970   // This allows us to write both PHINodes and the extractelement
3971   // instructions.
3972   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3973 
3974   setDebugLocFromInst(Builder, LoopExitInst);
3975 
3976   // If tail is folded by masking, the vector value to leave the loop should be
3977   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3978   // instead of the former.
3979   if (Cost->foldTailByMasking()) {
3980     for (unsigned Part = 0; Part < UF; ++Part) {
3981       Value *VecLoopExitInst =
3982           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3983       Value *Sel = nullptr;
3984       for (User *U : VecLoopExitInst->users()) {
3985         if (isa<SelectInst>(U)) {
3986           assert(!Sel && "Reduction exit feeding two selects");
3987           Sel = U;
3988         } else
3989           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3990       }
3991       assert(Sel && "Reduction exit feeds no select");
3992       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3993 
3994       // If the target can create a predicated operator for the reduction at no
3995       // extra cost in the loop (for example a predicated vadd), it can be
3996       // cheaper for the select to remain in the loop than be sunk out of it,
3997       // and so use the select value for the phi instead of the old
3998       // LoopExitValue.
3999       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4000       if (PreferPredicatedReductionSelect ||
4001           TTI->preferPredicatedReductionSelect(
4002               RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()),
4003               Phi->getType(), TargetTransformInfo::ReductionFlags())) {
4004         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4005         VecRdxPhi->setIncomingValueForBlock(
4006             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4007       }
4008     }
4009   }
4010 
4011   // If the vector reduction can be performed in a smaller type, we truncate
4012   // then extend the loop exit value to enable InstCombine to evaluate the
4013   // entire expression in the smaller type.
4014   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4015     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4016     assert(!VF.Scalable && "scalable vectors not yet supported.");
4017     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4018     Builder.SetInsertPoint(
4019         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4020     VectorParts RdxParts(UF);
4021     for (unsigned Part = 0; Part < UF; ++Part) {
4022       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4023       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4024       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4025                                         : Builder.CreateZExt(Trunc, VecTy);
4026       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4027            UI != RdxParts[Part]->user_end();)
4028         if (*UI != Trunc) {
4029           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4030           RdxParts[Part] = Extnd;
4031         } else {
4032           ++UI;
4033         }
4034     }
4035     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4036     for (unsigned Part = 0; Part < UF; ++Part) {
4037       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4038       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4039     }
4040   }
4041 
4042   // Reduce all of the unrolled parts into a single vector.
4043   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4044   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4045 
4046   // The middle block terminator has already been assigned a DebugLoc here (the
4047   // OrigLoop's single latch terminator). We want the whole middle block to
4048   // appear to execute on this line because: (a) it is all compiler generated,
4049   // (b) these instructions are always executed after evaluating the latch
4050   // conditional branch, and (c) other passes may add new predecessors which
4051   // terminate on this line. This is the easiest way to ensure we don't
4052   // accidentally cause an extra step back into the loop while debugging.
4053   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4054   for (unsigned Part = 1; Part < UF; ++Part) {
4055     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4056     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4057       // Floating point operations had to be 'fast' to enable the reduction.
4058       ReducedPartRdx = addFastMathFlag(
4059           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4060                               ReducedPartRdx, "bin.rdx"),
4061           RdxDesc.getFastMathFlags());
4062     else
4063       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4064                                       RdxPart);
4065   }
4066 
4067   // Create the reduction after the loop. Note that inloop reductions create the
4068   // target reduction in the loop using a Reduction recipe.
4069   if (VF.isVector() && !IsInLoopReductionPhi) {
4070     bool NoNaN = Legal->hasFunNoNaNAttr();
4071     ReducedPartRdx =
4072         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4073     // If the reduction can be performed in a smaller type, we need to extend
4074     // the reduction to the wider type before we branch to the original loop.
4075     if (Phi->getType() != RdxDesc.getRecurrenceType())
4076       ReducedPartRdx =
4077         RdxDesc.isSigned()
4078         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4079         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4080   }
4081 
4082   // Create a phi node that merges control-flow from the backedge-taken check
4083   // block and the middle block.
4084   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4085                                         LoopScalarPreHeader->getTerminator());
4086   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4087     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4088   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4089 
4090   // Now, we need to fix the users of the reduction variable
4091   // inside and outside of the scalar remainder loop.
4092   // We know that the loop is in LCSSA form. We need to update the
4093   // PHI nodes in the exit blocks.
4094   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4095     // All PHINodes need to have a single entry edge, or two if
4096     // we already fixed them.
4097     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4098 
4099     // We found a reduction value exit-PHI. Update it with the
4100     // incoming bypass edge.
4101     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4102       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4103   } // end of the LCSSA phi scan.
4104 
4105     // Fix the scalar loop reduction variable with the incoming reduction sum
4106     // from the vector body and from the backedge value.
4107   int IncomingEdgeBlockIdx =
4108     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4109   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4110   // Pick the other block.
4111   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4112   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4113   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4114 }
4115 
4116 void InnerLoopVectorizer::clearReductionWrapFlags(
4117     RecurrenceDescriptor &RdxDesc) {
4118   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4119   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4120       RK != RecurrenceDescriptor::RK_IntegerMult)
4121     return;
4122 
4123   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4124   assert(LoopExitInstr && "null loop exit instruction");
4125   SmallVector<Instruction *, 8> Worklist;
4126   SmallPtrSet<Instruction *, 8> Visited;
4127   Worklist.push_back(LoopExitInstr);
4128   Visited.insert(LoopExitInstr);
4129 
4130   while (!Worklist.empty()) {
4131     Instruction *Cur = Worklist.pop_back_val();
4132     if (isa<OverflowingBinaryOperator>(Cur))
4133       for (unsigned Part = 0; Part < UF; ++Part) {
4134         Value *V = getOrCreateVectorValue(Cur, Part);
4135         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4136       }
4137 
4138     for (User *U : Cur->users()) {
4139       Instruction *UI = cast<Instruction>(U);
4140       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4141           Visited.insert(UI).second)
4142         Worklist.push_back(UI);
4143     }
4144   }
4145 }
4146 
4147 void InnerLoopVectorizer::fixLCSSAPHIs() {
4148   assert(!VF.Scalable && "the code below assumes fixed width vectors");
4149   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4150     if (LCSSAPhi.getNumIncomingValues() == 1) {
4151       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4152       // Non-instruction incoming values will have only one value.
4153       unsigned LastLane = 0;
4154       if (isa<Instruction>(IncomingValue))
4155         LastLane = Cost->isUniformAfterVectorization(
4156                        cast<Instruction>(IncomingValue), VF)
4157                        ? 0
4158                        : VF.Min - 1;
4159       // Can be a loop invariant incoming value or the last scalar value to be
4160       // extracted from the vectorized loop.
4161       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4162       Value *lastIncomingValue =
4163           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4164       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4165     }
4166   }
4167 }
4168 
// Sink scalarized operands of the predicated instruction PredInst into the
// predicated block created for it, so they execute only when the predicate
// holds. Runs to a fixed point, since sinking one instruction can enable
// sinking its operands too.
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, it's
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      // dyn_cast yields null for non-instruction operands (constants,
      // arguments), which are skipped below.
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is already in the
      // predicated block, is not in the loop, or may have side effects.
      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
          !VectorLoop->contains(I) || I->mayHaveSideEffects())
        continue;

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // it's operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}
4232 
4233 void InnerLoopVectorizer::fixNonInductionPHIs() {
4234   for (PHINode *OrigPhi : OrigPHIsToFix) {
4235     PHINode *NewPhi =
4236         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4237     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4238 
4239     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4240         predecessors(OrigPhi->getParent()));
4241     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4242         predecessors(NewPhi->getParent()));
4243     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4244            "Scalar and Vector BB should have the same number of predecessors");
4245 
4246     // The insertion point in Builder may be invalidated by the time we get
4247     // here. Force the Builder insertion point to something valid so that we do
4248     // not run into issues during insertion point restore in
4249     // getOrCreateVectorValue calls below.
4250     Builder.SetInsertPoint(NewPhi);
4251 
4252     // The predecessor order is preserved and we can rely on mapping between
4253     // scalar and vector block predecessors.
4254     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4255       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4256 
4257       // When looking up the new scalar/vector values to fix up, use incoming
4258       // values from original phi.
4259       Value *ScIncV =
4260           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4261 
4262       // Scalar incoming value may need a broadcast
4263       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4264       NewPhi->addIncoming(NewIncV, NewPredBB);
4265     }
4266   }
4267 }
4268 
// Widen a GEP into a vector of pointers (or a scalar GEP per part when
// VF == 1). Loop-invariant operands are kept scalar; loop-varying operands
// use their per-part vector values from State.
void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
                                   unsigned UF, ElementCount VF,
                                   bool IsPtrLoopInvariant,
                                   SmallBitVector &IsIndexLoopInvariant,
                                   VPTransformState &State) {
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    auto *Clone = Builder.Insert(GEP->clone());
    for (unsigned Part = 0; Part < UF; ++Part) {
      // Splat the cloned scalar GEP to get one vector of pointers per part.
      Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
      VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
      addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
                                     : State.get(Operands.getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
        VPValue *Operand = Operands.getOperand(I);
        if (IsIndexLoopInvariant[I - 1])
          Indices.push_back(State.get(Operand, {0, 0}));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP =
          GEP->isInBounds()
              ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
                                          Indices)
              : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
      assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
      addMetadata(NewGEP, GEP);
    }
  }
}
4338 
// Widen a PHI node. Handles three cases: VPlan-native-path uniform phis,
// reduction/first-order-recurrence header phis (stage one: empty vector phi),
// and pointer induction variables (scalar steps or a vector pointer phi).
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
                                              ElementCount VF) {
  assert(!VF.Scalable && "scalable vectors not yet supported.");
  PHINode *P = cast<PHINode>(PN);
  if (EnableVPlanNativePath) {
    // Currently we enter here in the VPlan-native path for non-induction
    // PHIs where all control flow is uniform. We simply widen these PHIs.
    // Create a vector phi with no operands - the vector phi operands will be
    // set at the end of vector code generation.
    Type *VecTy =
        (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
    VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
    // Incoming values are filled in later by fixNonInductionPHIs().
    OrigPHIsToFix.push_back(P);

    return;
  }

  assert(PN->getParent() == OrigLoop->getHeader() &&
         "Non-header phis should have been handled elsewhere");

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      // This is phase one of vectorizing PHIs.
      // In-loop reductions keep the phi scalar; the reduction itself is done
      // in the loop by a reduction recipe.
      bool ScalarPHI =
          (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
      Type *VecTy =
          ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
      Value *EntryPart = PHINode::Create(
          VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
      VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
    }
    return;
  }

  setDebugLocFromInst(Builder, P);

  // This PHINode must be an induction variable.
  // Make sure that we know about it.
  assert(Legal->getInductionVars().count(P) && "Not an induction variable");

  InductionDescriptor II = Legal->getInductionVars().lookup(P);
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  switch (II.getKind()) {
  case InductionDescriptor::IK_NoInduction:
    llvm_unreachable("Unknown induction");
  case InductionDescriptor::IK_IntInduction:
  case InductionDescriptor::IK_FpInduction:
    llvm_unreachable("Integer/fp induction is handled elsewhere.");
  case InductionDescriptor::IK_PtrInduction: {
    // Handle the pointer induction variable case.
    assert(P->getType()->isPointerTy() && "Unexpected type.");

    if (Cost->isScalarAfterVectorization(P, VF)) {
      // This is the normalized GEP that starts counting at zero.
      Value *PtrInd =
          Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
      // Determine the number of scalars we need to generate for each unroll
      // iteration. If the instruction is uniform, we only need to generate the
      // first lane. Otherwise, we generate all VF values.
      unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.Min;
      for (unsigned Part = 0; Part < UF; ++Part) {
        for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
          // Global index of this lane within the vectorized iteration.
          Constant *Idx =
              ConstantInt::get(PtrInd->getType(), Lane + Part * VF.Min);
          Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
          Value *SclrGep =
              emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
          SclrGep->setName("next.gep");
          VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
        }
      }
      return;
    }
    assert(isa<SCEVConstant>(II.getStep()) &&
           "Induction step not a SCEV constant!");
    Type *PhiType = II.getStep()->getType();

    // Build a pointer phi
    Value *ScalarStartValue = II.getStartValue();
    Type *ScStValueType = ScalarStartValue->getType();
    PHINode *NewPointerPhi =
        PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
    NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);

    // A pointer induction, performed by using a gep
    BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
    Instruction *InductionLoc = LoopLatch->getTerminator();
    const SCEV *ScalarStep = II.getStep();
    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
    Value *ScalarStepValue =
        Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
    // Advance the pointer by step * VF * UF per vector iteration.
    Value *InductionGEP = GetElementPtrInst::Create(
        ScStValueType->getPointerElementType(), NewPointerPhi,
        Builder.CreateMul(ScalarStepValue,
                          ConstantInt::get(PhiType, VF.Min * UF)),
        "ptr.ind", InductionLoc);
    NewPointerPhi->addIncoming(InductionGEP, LoopLatch);

    // Create UF many actual address geps that use the pointer
    // phi as base and a vectorized version of the step value
    // (<step*0, ..., step*N>) as offset.
    for (unsigned Part = 0; Part < UF; ++Part) {
      SmallVector<Constant *, 8> Indices;
      // Create a vector of consecutive numbers from zero to VF.
      for (unsigned i = 0; i < VF.Min; ++i)
        Indices.push_back(ConstantInt::get(PhiType, i + Part * VF.Min));
      Constant *StartOffset = ConstantVector::get(Indices);

      Value *GEP = Builder.CreateGEP(
          ScStValueType->getPointerElementType(), NewPointerPhi,
          Builder.CreateMul(StartOffset,
                            Builder.CreateVectorSplat(VF.Min, ScalarStepValue),
                            "vector.gep"));
      VectorLoopValueMap.setVectorValue(P, Part, GEP);
    }
  }
  }
}
4465 
4466 /// A helper function for checking whether an integer division-related
4467 /// instruction may divide by zero (in which case it must be predicated if
4468 /// executed conditionally in the scalar code).
4469 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4470 /// Non-zero divisors that are non compile-time constants will not be
4471 /// converted into multiplication, so we will still end up scalarizing
4472 /// the division, but can do so w/o predication.
4473 static bool mayDivideByZero(Instruction &I) {
4474   assert((I.getOpcode() == Instruction::UDiv ||
4475           I.getOpcode() == Instruction::SDiv ||
4476           I.getOpcode() == Instruction::URem ||
4477           I.getOpcode() == Instruction::SRem) &&
4478          "Unexpected instruction");
4479   Value *Divisor = I.getOperand(1);
4480   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4481   return !CInt || CInt->isZero();
4482 }
4483 
// Widen a single instruction by emitting one vector instruction per unroll
// part. Operands come from State; results are recorded in VectorLoopValueMap.
void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
                                           VPTransformState &State) {
  assert(!VF.Scalable && "scalable vectors not yet supported.");
  switch (I.getOpcode()) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    setDebugLocFromInst(Builder, &I);

    for (unsigned Part = 0; Part < UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : User.operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

      // Preserve nsw/nuw/fast-math flags from the scalar instruction (the
      // result may be a constant-folded non-instruction, hence the dyn_cast).
      if (auto *VecOp = dyn_cast<Instruction>(V))
        VecOp->copyIRFlags(&I);

      // Use this vector value for all users of the original instruction.
      VectorLoopValueMap.setVectorValue(&I, Part, V);
      addMetadata(V, &I);
    }

    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = cast<CmpInst>(&I);
    setDebugLocFromInst(Builder, Cmp);
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = State.get(User.getOperand(0), Part);
      Value *B = State.get(User.getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      VectorLoopValueMap.setVectorValue(&I, Part, C);
      addMetadata(C, &I);
    }

    break;
  }

  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto *CI = cast<CastInst>(&I);
    setDebugLocFromInst(Builder, CI);

    /// Vectorize casts.
    assert(!VF.Scalable && "VF is assumed to be non scalable.");
    Type *DestTy =
        (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = State.get(User.getOperand(0), Part);
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      VectorLoopValueMap.setVectorValue(&I, Part, Cast);
      addMetadata(Cast, &I);
    }
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}
4592 
4593 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
4594                                                VPTransformState &State) {
4595   assert(!isa<DbgInfoIntrinsic>(I) &&
4596          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4597   setDebugLocFromInst(Builder, &I);
4598 
4599   Module *M = I.getParent()->getParent()->getParent();
4600   auto *CI = cast<CallInst>(&I);
4601 
4602   SmallVector<Type *, 4> Tys;
4603   for (Value *ArgOperand : CI->arg_operands())
4604     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.Min));
4605 
4606   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4607 
4608   // The flag shows whether we use Intrinsic or a usual Call for vectorized
4609   // version of the instruction.
4610   // Is it beneficial to perform intrinsic call compared to lib call?
4611   bool NeedToScalarize = false;
4612   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4613   bool UseVectorIntrinsic =
4614       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4615   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4616          "Instruction should be scalarized elsewhere.");
4617 
4618   for (unsigned Part = 0; Part < UF; ++Part) {
4619     SmallVector<Value *, 4> Args;
4620     for (auto &I : enumerate(ArgOperands.operands())) {
4621       // Some intrinsics have a scalar argument - don't replace it with a
4622       // vector.
4623       Value *Arg;
4624       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4625         Arg = State.get(I.value(), Part);
4626       else
4627         Arg = State.get(I.value(), {0, 0});
4628       Args.push_back(Arg);
4629     }
4630 
4631     Function *VectorF;
4632     if (UseVectorIntrinsic) {
4633       // Use vector version of the intrinsic.
4634       Type *TysForDecl[] = {CI->getType()};
4635       if (VF.isVector()) {
4636         assert(!VF.Scalable && "VF is assumed to be non scalable.");
4637         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4638       }
4639       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4640       assert(VectorF && "Can't retrieve vector intrinsic.");
4641     } else {
4642       // Use vector version of the function call.
4643       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4644 #ifndef NDEBUG
4645       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4646              "Can't create vector function.");
4647 #endif
4648         VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4649     }
4650       SmallVector<OperandBundleDef, 1> OpBundles;
4651       CI->getOperandBundlesAsDefs(OpBundles);
4652       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4653 
4654       if (isa<FPMathOperator>(V))
4655         V->copyFastMathFlags(CI);
4656 
4657       VectorLoopValueMap.setVectorValue(&I, Part, V);
4658       addMetadata(V, &I);
4659   }
4660 }
4661 
4662 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
4663                                                  VPUser &Operands,
4664                                                  bool InvariantCond,
4665                                                  VPTransformState &State) {
4666   setDebugLocFromInst(Builder, &I);
4667 
4668   // The condition can be loop invariant  but still defined inside the
4669   // loop. This means that we can't just use the original 'cond' value.
4670   // We have to take the 'vectorized' value and pick the first lane.
4671   // Instcombine will make this a no-op.
4672   auto *InvarCond =
4673       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4674 
4675   for (unsigned Part = 0; Part < UF; ++Part) {
4676     Value *Cond =
4677         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4678     Value *Op0 = State.get(Operands.getOperand(1), Part);
4679     Value *Op1 = State.get(Operands.getOperand(2), Part);
4680     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4681     VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4682     addMetadata(Sel, &I);
4683   }
4684 }
4685 
// Computes, for the given vectorization factor VF, the set of instructions
// that will remain scalar (i.e., will not be widened) after vectorization,
// and caches the result in Scalars[VF]. The analysis seeds a worklist with
// known-scalar instructions and then grows it through pointer chains and
// induction variables whose users are all scalar.
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
         "This function should not be visited twice for the same VF");

  // Worklist accumulates the instructions known to remain scalar.
  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that returns true if Ptr is a phi corresponding to a pointer
  // induction variable, and its use by MemAccess is a scalar use.
  auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
    if (!isa<PHINode>(Ptr) ||
        !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
      return false;
    auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
    if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
      return false;
    return isScalarUse(MemAccess, Ptr);
  };

  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is a scalar pointer induction of the loop, both the phi and its
  // update are inserted into Worklist directly. Otherwise, if the use will be
  // a scalar use, and the pointer is only used by memory accesses, we place
  // the pointer in ScalarPtrs; if not, it goes in PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    if (isScalarPtrInduction(MemAccess, Ptr)) {
      Worklist.insert(cast<Instruction>(Ptr));
      Instruction *Update = cast<Instruction>(
          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
      Worklist.insert(Update);
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
                        << "\n");
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
                        << "\n");
      return;
    }
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory accesses is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        // A store constrains both its pointer and its value operand.
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  // A pointer qualifies as scalar only if no use marked it possibly
  // non-scalar.
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    // Src is scalar if every in-loop user is already scalar or is a memory
    // access that uses Src only in a scalar way.
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  // Cache the final result for this VF.
  Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
4872 
4873 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
4874                                                          ElementCount VF) {
4875   assert(!VF.Scalable && "scalable vectors not yet supported.");
4876   if (!blockNeedsPredication(I->getParent()))
4877     return false;
4878   switch(I->getOpcode()) {
4879   default:
4880     break;
4881   case Instruction::Load:
4882   case Instruction::Store: {
4883     if (!Legal->isMaskRequired(I))
4884       return false;
4885     auto *Ptr = getLoadStorePointerOperand(I);
4886     auto *Ty = getMemInstValueType(I);
4887     // We have already decided how to vectorize this instruction, get that
4888     // result.
4889     if (VF.isVector()) {
4890       InstWidening WideningDecision = getWideningDecision(I, VF);
4891       assert(WideningDecision != CM_Unknown &&
4892              "Widening decision should be ready at this moment");
4893       return WideningDecision == CM_Scalarize;
4894     }
4895     const Align Alignment = getLoadStoreAlignment(I);
4896     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4897                                 isLegalMaskedGather(Ty, Alignment))
4898                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4899                                 isLegalMaskedScatter(Ty, Alignment));
4900   }
4901   case Instruction::UDiv:
4902   case Instruction::SDiv:
4903   case Instruction::SRem:
4904   case Instruction::URem:
4905     return mayDivideByZero(*I);
4906   }
4907   return false;
4908 }
4909 
4910 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4911     Instruction *I, ElementCount VF) {
4912   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4913   assert(getWideningDecision(I, VF) == CM_Unknown &&
4914          "Decision should not be set yet.");
4915   auto *Group = getInterleavedAccessGroup(I);
4916   assert(Group && "Must have a group.");
4917 
4918   // If the instruction's allocated size doesn't equal it's type size, it
4919   // requires padding and will be scalarized.
4920   auto &DL = I->getModule()->getDataLayout();
4921   auto *ScalarTy = getMemInstValueType(I);
4922   if (hasIrregularType(ScalarTy, DL, VF))
4923     return false;
4924 
4925   // Check if masking is required.
4926   // A Group may need masking for one of two reasons: it resides in a block that
4927   // needs predication, or it was decided to use masking to deal with gaps.
4928   bool PredicatedAccessRequiresMasking =
4929       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4930   bool AccessWithGapsRequiresMasking =
4931       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4932   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4933     return true;
4934 
4935   // If masked interleaving is required, we expect that the user/target had
4936   // enabled it, because otherwise it either wouldn't have been created or
4937   // it should have been invalidated by the CostModel.
4938   assert(useMaskedInterleavedAccesses(TTI) &&
4939          "Masked interleave-groups for predicated accesses are not enabled.");
4940 
4941   auto *Ty = getMemInstValueType(I);
4942   const Align Alignment = getLoadStoreAlignment(I);
4943   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4944                           : TTI.isLegalMaskedStore(Ty, Alignment);
4945 }
4946 
4947 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4948     Instruction *I, ElementCount VF) {
4949   // Get and ensure we have a valid memory instruction.
4950   LoadInst *LI = dyn_cast<LoadInst>(I);
4951   StoreInst *SI = dyn_cast<StoreInst>(I);
4952   assert((LI || SI) && "Invalid memory instruction");
4953 
4954   auto *Ptr = getLoadStorePointerOperand(I);
4955 
4956   // In order to be widened, the pointer should be consecutive, first of all.
4957   if (!Legal->isConsecutivePtr(Ptr))
4958     return false;
4959 
4960   // If the instruction is a store located in a predicated block, it will be
4961   // scalarized.
4962   if (isScalarWithPredication(I))
4963     return false;
4964 
4965   // If the instruction's allocated size doesn't equal it's type size, it
4966   // requires padding and will be scalarized.
4967   auto &DL = I->getModule()->getDataLayout();
4968   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4969   if (hasIrregularType(ScalarTy, DL, VF))
4970     return false;
4971 
4972   return true;
4973 }
4974 
// Computes, for the given vectorization factor VF, the set of instructions
// that will remain uniform (the same value for all vector lanes) after
// vectorization, and caches the result in Uniforms[VF].
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
         "This function should not be visited twice for the same VF");

  // Create the entry for this VF up front so that even if no uniform value
  // is found we will not analyze again: Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Instructions that are scalar with predication must not be considered
  // uniform after vectorization, because that would create an erroneous
  // replicating region where only a single instance out of VF should be formed.
  // TODO: optimize such seldom cases if found important, see PR40816.
  auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (isScalarWithPredication(I, VF)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                        << *I << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(I);
  };

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
    addToWorklistIfAllowed(Cmp);

  // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
  // are pointers that are treated like consecutive pointers during
  // vectorization. The pointer operands of interleaved accesses are an
  // example.
  SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;

  // Holds pointer operands of instructions that are possibly non-uniform.
  SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;

  // Returns true if the memory instruction I will be widened (consecutively,
  // reversed, or as part of an interleave group) rather than scalarized or
  // turned into a gather/scatter.
  auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };
  // Iterate over the instructions in the loop, and collect all
  // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
  // that a consecutive-like pointer operand will be scalarized, we collect it
  // in PossibleNonUniformPtrs instead. We use two sets here because a single
  // getelementptr instruction can be used by both vectorized and scalarized
  // memory instructions. For example, if a loop loads and stores from the same
  // location, but the store is conditional, the store will be scalarized, and
  // the getelementptr won't remain uniform.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (!Ptr)
        continue;

      // True if all users of Ptr are memory accesses that have Ptr as their
      // pointer operand.
      auto UsersAreMemAccesses =
          llvm::all_of(Ptr->users(), [&](User *U) -> bool {
            return getLoadStorePointerOperand(U) == Ptr;
          });

      // Ensure the memory instruction will not be scalarized or used by
      // gather/scatter, making its pointer operand non-uniform. If the pointer
      // operand is used by any instruction other than a memory access, we
      // conservatively assume the pointer operand may be non-uniform.
      if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
        PossibleNonUniformPtrs.insert(Ptr);

      // If the memory instruction will be vectorized and its pointer operand
      // is consecutive-like, or interleaving - the pointer operand should
      // remain uniform.
      else
        ConsecutiveLikePtrs.insert(Ptr);
    }

  // Add to the Worklist all consecutive and consecutive-like pointers that
  // aren't also identified as possibly non-uniform.
  for (auto *V : ConsecutiveLikePtrs)
    if (!PossibleNonUniformPtrs.count(V))
      addToWorklistIfAllowed(V);

  // Expand Worklist in topological order: whenever a new instruction
  // is added , its users should be already inside Worklist.  It ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    for (auto OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (isOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(OV);
      if (OP && Legal->isFirstOrderRecurrence(OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(OV);
      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
            auto *J = cast<Instruction>(U);
            return Worklist.count(J) ||
                   (OI == getLoadStorePointerOperand(J) &&
                    isUniformDecision(J, VF));
          }))
        addToWorklistIfAllowed(OI);
    }
  }

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, and I is known to not require scalarization.
  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
  };

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  for (auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    addToWorklistIfAllowed(Ind);
    addToWorklistIfAllowed(IndUpdate);
  }

  // Cache the final result for this VF.
  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
5156 
5157 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5158   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5159 
5160   if (Legal->getRuntimePointerChecking()->Need) {
5161     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5162         "runtime pointer checks needed. Enable vectorization of this "
5163         "loop with '#pragma clang loop vectorize(enable)' when "
5164         "compiling with -Os/-Oz",
5165         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5166     return true;
5167   }
5168 
5169   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5170     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5171         "runtime SCEV checks needed. Enable vectorization of this "
5172         "loop with '#pragma clang loop vectorize(enable)' when "
5173         "compiling with -Os/-Oz",
5174         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5175     return true;
5176   }
5177 
5178   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5179   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5180     reportVectorizationFailure("Runtime stride check for small trip count",
5181         "runtime stride == 1 checks needed. Enable vectorization of "
5182         "this loop without such check by compiling with -Os/-Oz",
5183         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5184     return true;
5185   }
5186 
5187   return false;
5188 }
5189 
// Determines the maximum vectorization factor to consider, or None if the
// loop should not be vectorized at all. Honors the user-requested VF and
// interleave count, the scalar-epilogue policy, and tail-folding; may mutate
// ScalarEpilogueStatus and FoldTailByMasking as side effects.
Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
                                                            unsigned UserIC) {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the loop is still
    // likely to be dynamically uniform if the target can skip the checks.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return None;
  }

  // TC == 0 means the trip count is not a known constant.
  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return None;
  }

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    // A scalar epilogue handles the tail, so no further restrictions apply.
    return UserVF ? UserVF : computeFeasibleMaxVF(TC);
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return None;
    break;
  }

  // Now try the tail folding

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
  assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
  // Consider the interleave count when checking whether a tail remains.
  unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
  if (TC > 0 && TC % MaxVFtimesIC == 0) {
    // Accept MaxVF if we do not have a tail.
    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
    return MaxVF;
  }

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxVF;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fallback to a vectorization with a scalar epilogue.
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) {
      LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
      return None;
    }
    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                         "scalar epilogue instead.\n");
    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
    return MaxVF;
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return None;
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return None;
}
5296 
// Computes the maximum feasible vectorization factor from the widest legal
// register width, the types used in the loop, the maximum safe dependence
// distance, and (optionally) register pressure when maximizing bandwidth.
unsigned
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();

  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
  unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << WidestRegister << " bits.\n");

  assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
                                 " into one vector!");
  if (MaxVectorSize == 0) {
    LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
    return MaxVectorSize;
  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
             isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to be the ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below.
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
                      << ConstTripCount << "\n");
    MaxVectorSize = ConstTripCount;
    return MaxVectorSize;
  }

  unsigned MaxVF = MaxVectorSize;
  if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorSize).
    SmallVector<ElementCount, 8> VFs;
    unsigned NewMaxVectorSize = WidestRegister / SmallestType;
    for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
      VFs.push_back(ElementCount::getFixed(VS));

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    for (int i = RUs.size() - 1; i >= 0; --i) {
      bool Selected = true;
      for (auto& pair : RUs[i].MaxLocalUsers) {
        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
        if (pair.second > TargetNumRegisters)
          Selected = false;
      }
      if (Selected) {
        MaxVF = VFs[i].Min;
        break;
      }
    }
    // The target may impose a minimum VF for a given element type; never go
    // below it.
    if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }
  }
  return MaxVF;
}
5374 
// Chooses the most profitable vectorization factor in [1, MaxVF] by
// comparing the per-lane expected cost of each candidate VF against the
// scalar loop cost. Returns the chosen width together with its cost.
VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
  // Cost of the scalar (VF=1) loop; the baseline every vector VF must beat.
  float Cost = expectedCost(ElementCount::getFixed(1)).first;
  const float ScalarCost = Cost;
  unsigned Width = 1;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && MaxVF > 1) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    Cost = std::numeric_limits<float>::max();
  }

  // Candidate VFs are powers of two up to MaxVF.
  for (unsigned i = 2; i <= MaxVF; i *= 2) {
    // Notice that the vector loop needs to be executed less times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
    VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
    float VectorCost = C.first / (float)i;
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (int)VectorCost << ".\n");
    // C.second is false when the candidate would produce no vector
    // instructions at all; skip it unless vectorization is forced.
    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  // Conditional stores force scalarization; bail back to the scalar loop if
  // their vectorization is disabled.
  if (!EnableCondStoresVectorization && NumPredStores) {
    reportVectorizationFailure("There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, TheLoop);
    Width = 1;
    Cost = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
  VectorizationFactor Factor = {ElementCount::getFixed(Width),
                                (unsigned)(Width * Cost)};
  return Factor;
}
5426 
5427 std::pair<unsigned, unsigned>
5428 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5429   unsigned MinWidth = -1U;
5430   unsigned MaxWidth = 8;
5431   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5432 
5433   // For each block.
5434   for (BasicBlock *BB : TheLoop->blocks()) {
5435     // For each instruction in the loop.
5436     for (Instruction &I : BB->instructionsWithoutDebug()) {
5437       Type *T = I.getType();
5438 
5439       // Skip ignored values.
5440       if (ValuesToIgnore.count(&I))
5441         continue;
5442 
5443       // Only examine Loads, Stores and PHINodes.
5444       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5445         continue;
5446 
5447       // Examine PHI nodes that are reduction variables. Update the type to
5448       // account for the recurrence type.
5449       if (auto *PN = dyn_cast<PHINode>(&I)) {
5450         if (!Legal->isReductionVariable(PN))
5451           continue;
5452         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5453         T = RdxDesc.getRecurrenceType();
5454       }
5455 
5456       // Examine the stored values.
5457       if (auto *ST = dyn_cast<StoreInst>(&I))
5458         T = ST->getValueOperand()->getType();
5459 
5460       // Ignore loaded pointer types and stored pointer types that are not
5461       // vectorizable.
5462       //
5463       // FIXME: The check here attempts to predict whether a load or store will
5464       //        be vectorized. We only know this for certain after a VF has
5465       //        been selected. Here, we assume that if an access can be
5466       //        vectorized, it will be. We should also look at extending this
5467       //        optimization to non-pointer types.
5468       //
5469       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5470           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5471         continue;
5472 
5473       MinWidth = std::min(MinWidth,
5474                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5475       MaxWidth = std::max(MaxWidth,
5476                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5477     }
5478   }
5479 
5480   return {MinWidth, MaxWidth};
5481 }
5482 
5483 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5484                                                            unsigned LoopCost) {
5485   // -- The interleave heuristics --
5486   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5487   // There are many micro-architectural considerations that we can't predict
5488   // at this level. For example, frontend pressure (on decode or fetch) due to
5489   // code size, or the number and capabilities of the execution ports.
5490   //
5491   // We use the following heuristics to select the interleave count:
5492   // 1. If the code has reductions, then we interleave to break the cross
5493   // iteration dependency.
5494   // 2. If the loop is really small, then we interleave to reduce the loop
5495   // overhead.
5496   // 3. We don't interleave if we think that we will spill registers to memory
5497   // due to the increased register pressure.
5498 
5499   if (!isScalarEpilogueAllowed())
5500     return 1;
5501 
5502   // We used the distance for the interleave count.
5503   if (Legal->getMaxSafeDepDistBytes() != -1U)
5504     return 1;
5505 
5506   // Do not interleave loops with a relatively small known or estimated trip
5507   // count.
5508   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5509   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5510     return 1;
5511 
5512   RegisterUsage R = calculateRegisterUsage({VF})[0];
5513   // We divide by these constants so assume that we have at least one
5514   // instruction that uses at least one register.
5515   for (auto& pair : R.MaxLocalUsers) {
5516     pair.second = std::max(pair.second, 1U);
5517   }
5518 
5519   // We calculate the interleave count using the following formula.
5520   // Subtract the number of loop invariants from the number of available
5521   // registers. These registers are used by all of the interleaved instances.
5522   // Next, divide the remaining registers by the number of registers that is
5523   // required by the loop, in order to estimate how many parallel instances
5524   // fit without causing spills. All of this is rounded down if necessary to be
5525   // a power of two. We want power of two interleave count to simplify any
5526   // addressing operations or alignment considerations.
5527   // We also want power of two interleave counts to ensure that the induction
5528   // variable of the vector loop wraps to zero, when tail is folded by masking;
5529   // this currently happens when OptForSize, in which case IC is set to 1 above.
5530   unsigned IC = UINT_MAX;
5531 
5532   for (auto& pair : R.MaxLocalUsers) {
5533     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5534     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5535                       << " registers of "
5536                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5537     if (VF == 1) {
5538       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5539         TargetNumRegisters = ForceTargetNumScalarRegs;
5540     } else {
5541       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5542         TargetNumRegisters = ForceTargetNumVectorRegs;
5543     }
5544     unsigned MaxLocalUsers = pair.second;
5545     unsigned LoopInvariantRegs = 0;
5546     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5547       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5548 
5549     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5550     // Don't count the induction variable as interleaved.
5551     if (EnableIndVarRegisterHeur) {
5552       TmpIC =
5553           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5554                         std::max(1U, (MaxLocalUsers - 1)));
5555     }
5556 
5557     IC = std::min(IC, TmpIC);
5558   }
5559 
5560   // Clamp the interleave ranges to reasonable counts.
5561   assert(!VF.Scalable && "scalable vectors not yet supported.");
5562   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF.Min);
5563 
5564   // Check if the user has overridden the max.
5565   if (VF == 1) {
5566     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5567       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5568   } else {
5569     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5570       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5571   }
5572 
5573   // If trip count is known or estimated compile time constant, limit the
5574   // interleave count to be less than the trip count divided by VF.
5575   if (BestKnownTC) {
5576     MaxInterleaveCount = std::min(*BestKnownTC / VF.Min, MaxInterleaveCount);
5577   }
5578 
5579   // If we did not calculate the cost for VF (because the user selected the VF)
5580   // then we calculate the cost of VF here.
5581   if (LoopCost == 0)
5582     LoopCost = expectedCost(VF).first;
5583 
5584   assert(LoopCost && "Non-zero loop cost expected");
5585 
5586   // Clamp the calculated IC to be between the 1 and the max interleave count
5587   // that the target and trip count allows.
5588   if (IC > MaxInterleaveCount)
5589     IC = MaxInterleaveCount;
5590   else if (IC < 1)
5591     IC = 1;
5592 
5593   // Interleave if we vectorized this loop and there is a reduction that could
5594   // benefit from interleaving.
5595   if (VF.isVector() && !Legal->getReductionVars().empty()) {
5596     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5597     return IC;
5598   }
5599 
5600   // Note that if we've already vectorized the loop we will have done the
5601   // runtime check and so interleaving won't require further checks.
5602   bool InterleavingRequiresRuntimePointerCheck =
5603       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5604 
5605   // We want to interleave small loops in order to reduce the loop overhead and
5606   // potentially expose ILP opportunities.
5607   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5608   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5609     // We assume that the cost overhead is 1 and we use the cost model
5610     // to estimate the cost of the loop and interleave until the cost of the
5611     // loop overhead is about 5% of the cost of the loop.
5612     unsigned SmallIC =
5613         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5614 
5615     // Interleave until store/load ports (estimated by max interleave count) are
5616     // saturated.
5617     unsigned NumStores = Legal->getNumStores();
5618     unsigned NumLoads = Legal->getNumLoads();
5619     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5620     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5621 
5622     // If we have a scalar reduction (vector reductions are already dealt with
5623     // by this point), we can increase the critical path length if the loop
5624     // we're interleaving is inside another loop. Limit, by default to 2, so the
5625     // critical path only gets increased by one reduction operation.
5626     if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
5627       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5628       SmallIC = std::min(SmallIC, F);
5629       StoresIC = std::min(StoresIC, F);
5630       LoadsIC = std::min(LoadsIC, F);
5631     }
5632 
5633     if (EnableLoadStoreRuntimeInterleave &&
5634         std::max(StoresIC, LoadsIC) > SmallIC) {
5635       LLVM_DEBUG(
5636           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5637       return std::max(StoresIC, LoadsIC);
5638     }
5639 
5640     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5641     return SmallIC;
5642   }
5643 
5644   // Interleave if this is a large loop (small loops are already dealt with by
5645   // this point) that could benefit from interleaving.
5646   bool HasReductions = !Legal->getReductionVars().empty();
5647   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5648     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5649     return IC;
5650   }
5651 
5652   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5653   return 1;
5654 }
5655 
5656 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5657 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5658   // This function calculates the register usage by measuring the highest number
5659   // of values that are alive at a single location. Obviously, this is a very
5660   // rough estimation. We scan the loop in a topological order in order and
5661   // assign a number to each instruction. We use RPO to ensure that defs are
5662   // met before their users. We assume that each instruction that has in-loop
5663   // users starts an interval. We record every time that an in-loop value is
5664   // used, so we have a list of the first and last occurrences of each
5665   // instruction. Next, we transpose this data structure into a multi map that
5666   // holds the list of intervals that *end* at a specific location. This multi
5667   // map allows us to perform a linear search. We scan the instructions linearly
5668   // and record each time that a new interval starts, by placing it in a set.
5669   // If we find this value in the multi-map then we remove it from the set.
5670   // The max register usage is the maximum size of the set.
5671   // We also search for instructions that are defined outside the loop, but are
5672   // used inside the loop. We need this number separately from the max-interval
5673   // usage number because when we unroll, loop-invariant values do not take
5674   // more register.
5675   LoopBlocksDFS DFS(TheLoop);
5676   DFS.perform(LI);
5677 
5678   RegisterUsage RU;
5679 
5680   // Each 'key' in the map opens a new interval. The values
5681   // of the map are the index of the 'last seen' usage of the
5682   // instruction that is the key.
5683   using IntervalMap = DenseMap<Instruction *, unsigned>;
5684 
5685   // Maps instruction to its index.
5686   SmallVector<Instruction *, 64> IdxToInstr;
5687   // Marks the end of each interval.
5688   IntervalMap EndPoint;
5689   // Saves the list of instruction indices that are used in the loop.
5690   SmallPtrSet<Instruction *, 8> Ends;
5691   // Saves the list of values that are used in the loop but are
5692   // defined outside the loop, such as arguments and constants.
5693   SmallPtrSet<Value *, 8> LoopInvariants;
5694 
5695   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5696     for (Instruction &I : BB->instructionsWithoutDebug()) {
5697       IdxToInstr.push_back(&I);
5698 
5699       // Save the end location of each USE.
5700       for (Value *U : I.operands()) {
5701         auto *Instr = dyn_cast<Instruction>(U);
5702 
5703         // Ignore non-instruction values such as arguments, constants, etc.
5704         if (!Instr)
5705           continue;
5706 
5707         // If this instruction is outside the loop then record it and continue.
5708         if (!TheLoop->contains(Instr)) {
5709           LoopInvariants.insert(Instr);
5710           continue;
5711         }
5712 
5713         // Overwrite previous end points.
5714         EndPoint[Instr] = IdxToInstr.size();
5715         Ends.insert(Instr);
5716       }
5717     }
5718   }
5719 
5720   // Saves the list of intervals that end with the index in 'key'.
5721   using InstrList = SmallVector<Instruction *, 2>;
5722   DenseMap<unsigned, InstrList> TransposeEnds;
5723 
5724   // Transpose the EndPoints to a list of values that end at each index.
5725   for (auto &Interval : EndPoint)
5726     TransposeEnds[Interval.second].push_back(Interval.first);
5727 
5728   SmallPtrSet<Instruction *, 8> OpenIntervals;
5729 
5730   // Get the size of the widest register.
5731   unsigned MaxSafeDepDist = -1U;
5732   if (Legal->getMaxSafeDepDistBytes() != -1U)
5733     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5734   unsigned WidestRegister =
5735       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5736   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5737 
5738   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5739   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5740 
5741   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5742 
5743   // A lambda that gets the register usage for the given type and VF.
5744   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) {
5745     if (Ty->isTokenTy())
5746       return 0U;
5747     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5748     assert(!VF.Scalable && "scalable vectors not yet supported.");
5749     return std::max<unsigned>(1, VF.Min * TypeSize / WidestRegister);
5750   };
5751 
5752   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5753     Instruction *I = IdxToInstr[i];
5754 
5755     // Remove all of the instructions that end at this location.
5756     InstrList &List = TransposeEnds[i];
5757     for (Instruction *ToRemove : List)
5758       OpenIntervals.erase(ToRemove);
5759 
5760     // Ignore instructions that are never used within the loop.
5761     if (!Ends.count(I))
5762       continue;
5763 
5764     // Skip ignored values.
5765     if (ValuesToIgnore.count(I))
5766       continue;
5767 
5768     // For each VF find the maximum usage of registers.
5769     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5770       // Count the number of live intervals.
5771       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5772 
5773       if (VFs[j].isScalar()) {
5774         for (auto Inst : OpenIntervals) {
5775           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5776           if (RegUsage.find(ClassID) == RegUsage.end())
5777             RegUsage[ClassID] = 1;
5778           else
5779             RegUsage[ClassID] += 1;
5780         }
5781       } else {
5782         collectUniformsAndScalars(VFs[j]);
5783         for (auto Inst : OpenIntervals) {
5784           // Skip ignored values for VF > 1.
5785           if (VecValuesToIgnore.count(Inst))
5786             continue;
5787           if (isScalarAfterVectorization(Inst, VFs[j])) {
5788             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5789             if (RegUsage.find(ClassID) == RegUsage.end())
5790               RegUsage[ClassID] = 1;
5791             else
5792               RegUsage[ClassID] += 1;
5793           } else {
5794             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5795             if (RegUsage.find(ClassID) == RegUsage.end())
5796               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5797             else
5798               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5799           }
5800         }
5801       }
5802 
5803       for (auto& pair : RegUsage) {
5804         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5805           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5806         else
5807           MaxUsages[j][pair.first] = pair.second;
5808       }
5809     }
5810 
5811     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5812                       << OpenIntervals.size() << '\n');
5813 
5814     // Add the current instruction to the list of open intervals.
5815     OpenIntervals.insert(I);
5816   }
5817 
5818   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5819     SmallMapVector<unsigned, unsigned, 4> Invariant;
5820 
5821     for (auto Inst : LoopInvariants) {
5822       unsigned Usage =
5823           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5824       unsigned ClassID =
5825           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
5826       if (Invariant.find(ClassID) == Invariant.end())
5827         Invariant[ClassID] = Usage;
5828       else
5829         Invariant[ClassID] += Usage;
5830     }
5831 
5832     LLVM_DEBUG({
5833       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5834       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5835              << " item\n";
5836       for (const auto &pair : MaxUsages[i]) {
5837         dbgs() << "LV(REG): RegisterClass: "
5838                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5839                << " registers\n";
5840       }
5841       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5842              << " item\n";
5843       for (const auto &pair : Invariant) {
5844         dbgs() << "LV(REG): RegisterClass: "
5845                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5846                << " registers\n";
5847       }
5848     });
5849 
5850     RU.LoopInvariantRegs = Invariant;
5851     RU.MaxLocalUsers = MaxUsages[i];
5852     RUs[i] = RU;
5853   }
5854 
5855   return RUs;
5856 }
5857 
5858 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
5859   // TODO: Cost model for emulated masked load/store is completely
5860   // broken. This hack guides the cost model to use an artificially
5861   // high enough value to practically disable vectorization with such
5862   // operations, except where previously deployed legality hack allowed
5863   // using very low cost values. This is to avoid regressions coming simply
5864   // from moving "masked load/store" check from legality to cost model.
5865   // Masked Load/Gather emulation was previously never allowed.
5866   // Limited number of Masked Store/Scatter emulation was allowed.
5867   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5868   return isa<LoadInst>(I) ||
5869          (isa<StoreInst>(I) &&
5870           NumPredStores > NumberOfStoresToPredicate);
5871 }
5872 
5873 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5874   // If we aren't vectorizing the loop, or if we've already collected the
5875   // instructions to scalarize, there's nothing to do. Collection may already
5876   // have occurred if we have a user-selected VF and are now computing the
5877   // expected cost for interleaving.
5878   if (VF.isScalar() || VF.isZero() ||
5879       InstsToScalarize.find(VF) != InstsToScalarize.end())
5880     return;
5881 
5882   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5883   // not profitable to scalarize any instructions, the presence of VF in the
5884   // map will indicate that we've analyzed it already.
5885   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5886 
5887   // Find all the instructions that are scalar with predication in the loop and
5888   // determine if it would be better to not if-convert the blocks they are in.
5889   // If so, we also record the instructions to scalarize.
5890   for (BasicBlock *BB : TheLoop->blocks()) {
5891     if (!blockNeedsPredication(BB))
5892       continue;
5893     for (Instruction &I : *BB)
5894       if (isScalarWithPredication(&I)) {
5895         ScalarCostsTy ScalarCosts;
5896         // Do not apply discount logic if hacked cost is needed
5897         // for emulated masked memrefs.
5898         if (!useEmulatedMaskMemRefHack(&I) &&
5899             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5900           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5901         // Remember that BB will remain after vectorization.
5902         PredicatedBBsAfterVectorization.insert(BB);
5903       }
5904   }
5905 }
5906 
// Computes the expected cost discount of scalarizing the single-use chain of
// instructions feeding the predicated instruction PredInst, compared to
// vectorizing it. A non-negative result means the scalarized form is at
// least as cheap; the per-instruction scalar costs of the chain are recorded
// in ScalarCosts for the caller.
int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
    ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  int Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.find(I) != ScalarCosts.end())
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    unsigned VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    assert(!VF.Scalable && "scalable vectors not yet supported.");
    unsigned ScalarCost =
        VF.Min * getInstructionCost(I, ElementCount::getFixed(1)).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
      ScalarCost += TTI.getScalarizationOverhead(
          cast<VectorType>(ToVectorTy(I->getType(), VF)),
          APInt::getAllOnesValue(VF.Min), true, false);
      assert(!VF.Scalable && "scalable vectors not yet supported.");
      ScalarCost +=
          VF.Min *
          TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF)) {
          assert(!VF.Scalable && "scalable vectors not yet supported.");
          ScalarCost += TTI.getScalarizationOverhead(
              cast<VectorType>(ToVectorTy(J->getType(), VF)),
              APInt::getAllOnesValue(VF.Min), false, true);
        }
      }

    // Scale the total scalar cost by block probability. Note this is integer
    // division and truncates toward zero.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}
6021 
6022 LoopVectorizationCostModel::VectorizationCostTy
6023 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6024   assert(!VF.Scalable && "scalable vectors not yet supported.");
6025   VectorizationCostTy Cost;
6026 
6027   // For each block.
6028   for (BasicBlock *BB : TheLoop->blocks()) {
6029     VectorizationCostTy BlockCost;
6030 
6031     // For each instruction in the old loop.
6032     for (Instruction &I : BB->instructionsWithoutDebug()) {
6033       // Skip ignored values.
6034       if (ValuesToIgnore.count(&I) ||
6035           (VF.isVector() && VecValuesToIgnore.count(&I)))
6036         continue;
6037 
6038       VectorizationCostTy C = getInstructionCost(&I, VF);
6039 
6040       // Check if we should override the cost.
6041       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6042         C.first = ForceTargetInstructionCost;
6043 
6044       BlockCost.first += C.first;
6045       BlockCost.second |= C.second;
6046       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6047                         << " for VF " << VF << " For instruction: " << I
6048                         << '\n');
6049     }
6050 
6051     // If we are vectorizing a predicated block, it will have been
6052     // if-converted. This means that the block's instructions (aside from
6053     // stores and instructions that may divide by zero) will now be
6054     // unconditionally executed. For the scalar case, we may not always execute
6055     // the predicated block. Thus, scale the block's cost by the probability of
6056     // executing it.
6057     if (VF.isScalar() && blockNeedsPredication(BB))
6058       BlockCost.first /= getReciprocalPredBlockProb();
6059 
6060     Cost.first += BlockCost.first;
6061     Cost.second |= BlockCost.second;
6062   }
6063 
6064   return Cost;
6065 }
6066 
6067 /// Gets Address Access SCEV after verifying that the access pattern
6068 /// is loop invariant except the induction variable dependence.
6069 ///
6070 /// This SCEV can be sent to the Target in order to estimate the address
6071 /// calculation cost.
6072 static const SCEV *getAddressAccessSCEV(
6073               Value *Ptr,
6074               LoopVectorizationLegality *Legal,
6075               PredicatedScalarEvolution &PSE,
6076               const Loop *TheLoop) {
6077 
6078   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6079   if (!Gep)
6080     return nullptr;
6081 
6082   // We are looking for a gep with all loop invariant indices except for one
6083   // which should be an induction variable.
6084   auto SE = PSE.getSE();
6085   unsigned NumOperands = Gep->getNumOperands();
6086   for (unsigned i = 1; i < NumOperands; ++i) {
6087     Value *Opd = Gep->getOperand(i);
6088     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6089         !Legal->isInductionVariable(Opd))
6090       return nullptr;
6091   }
6092 
6093   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
6094   return PSE.getSCEV(Ptr);
6095 }
6096 
6097 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6098   return Legal->hasStride(I->getOperand(0)) ||
6099          Legal->hasStride(I->getOperand(1));
6100 }
6101 
6102 unsigned
6103 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6104                                                         ElementCount VF) {
6105   assert(VF.isVector() &&
6106          "Scalarization cost of instruction implies vectorization.");
6107   assert(!VF.Scalable && "scalable vectors not yet supported.");
6108   Type *ValTy = getMemInstValueType(I);
6109   auto SE = PSE.getSE();
6110 
6111   unsigned AS = getLoadStoreAddressSpace(I);
6112   Value *Ptr = getLoadStorePointerOperand(I);
6113   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6114 
6115   // Figure out whether the access is strided and get the stride value
6116   // if it's known in compile time
6117   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6118 
6119   // Get the cost of the scalar memory instruction and address computation.
6120   unsigned Cost = VF.Min * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6121 
6122   // Don't pass *I here, since it is scalar but will actually be part of a
6123   // vectorized loop where the user of it is a vectorized instruction.
6124   const Align Alignment = getLoadStoreAlignment(I);
6125   Cost += VF.Min *
6126           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6127                               AS, TTI::TCK_RecipThroughput);
6128 
6129   // Get the overhead of the extractelement and insertelement instructions
6130   // we might create due to scalarization.
6131   Cost += getScalarizationOverhead(I, VF);
6132 
6133   // If we have a predicated store, it may not be executed for each vector
6134   // lane. Scale the cost by the probability of executing the predicated
6135   // block.
6136   if (isPredicatedInst(I)) {
6137     Cost /= getReciprocalPredBlockProb();
6138 
6139     if (useEmulatedMaskMemRefHack(I))
6140       // Artificially setting to a high enough value to practically disable
6141       // vectorization with such operations.
6142       Cost = 3000000;
6143   }
6144 
6145   return Cost;
6146 }
6147 
6148 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6149                                                              ElementCount VF) {
6150   Type *ValTy = getMemInstValueType(I);
6151   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6152   Value *Ptr = getLoadStorePointerOperand(I);
6153   unsigned AS = getLoadStoreAddressSpace(I);
6154   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6155   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6156 
6157   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6158          "Stride should be 1 or -1 for consecutive memory access");
6159   const Align Alignment = getLoadStoreAlignment(I);
6160   unsigned Cost = 0;
6161   if (Legal->isMaskRequired(I))
6162     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6163                                       CostKind);
6164   else
6165     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6166                                 CostKind, I);
6167 
6168   bool Reverse = ConsecutiveStride < 0;
6169   if (Reverse)
6170     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6171   return Cost;
6172 }
6173 
// Cost of a load/store whose address is loop-uniform: the memory operation
// stays scalar; a load additionally broadcasts the loaded value to all
// lanes, and a store may need to extract the last lane's value first.
unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                         ElementCount VF) {
  Type *ValTy = getMemInstValueType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  const Align Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  if (isa<LoadInst>(I)) {
    // Scalar load + broadcast into the vector.
    return TTI.getAddressComputationCost(ValTy) +
           TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
                               CostKind) +
           TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
  }
  StoreInst *SI = cast<StoreInst>(I);

  // If the stored value is itself loop invariant no extract is needed;
  // otherwise the value of the last vector lane (VF.Min - 1) is extracted
  // and stored scalar.
  bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
  return TTI.getAddressComputationCost(ValTy) +
         TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
                             CostKind) +
         (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(
                                              Instruction::ExtractElement,
                                              VectorTy, VF.Min - 1));
}
6197 
6198 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6199                                                           ElementCount VF) {
6200   Type *ValTy = getMemInstValueType(I);
6201   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6202   const Align Alignment = getLoadStoreAlignment(I);
6203   const Value *Ptr = getLoadStorePointerOperand(I);
6204 
6205   return TTI.getAddressComputationCost(VectorTy) +
6206          TTI.getGatherScatterOpCost(
6207              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6208              TargetTransformInfo::TCK_RecipThroughput, I);
6209 }
6210 
// Cost of executing the whole interleave group that \p I belongs to as one
// wide (factor * VF) memory operation plus the shuffles that (de)interleave
// the members, and reverse shuffles if the group is reversed.
unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                            ElementCount VF) {
  Type *ValTy = getMemInstValueType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  // The wide vector covers all members of the group: VF * factor elements.
  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.Scalable && "scalable vectors not yet supported.");
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in an interleaved load group.
  // An interleaved store group doesn't need this as it doesn't allow gaps.
  SmallVector<unsigned, 4> Indices;
  if (isa<LoadInst>(I)) {
    for (unsigned i = 0; i < InterleaveFactor; i++)
      if (Group->getMember(i))
        Indices.push_back(i);
  }

  // Calculate the cost of the whole interleaved group.
  // Gaps force masking when a scalar epilogue (which would absorb the
  // trailing partial accesses) is not allowed.
  bool UseMaskForGaps =
      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
  unsigned Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    // Each member needs its own reverse shuffle.
    Cost += Group->getNumMembers() *
            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
  }
  return Cost;
}
6249 
6250 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6251                                                               ElementCount VF) {
6252   // Calculate scalar cost only. Vectorization cost should be ready at this
6253   // moment.
6254   if (VF.isScalar()) {
6255     Type *ValTy = getMemInstValueType(I);
6256     const Align Alignment = getLoadStoreAlignment(I);
6257     unsigned AS = getLoadStoreAddressSpace(I);
6258 
6259     return TTI.getAddressComputationCost(ValTy) +
6260            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6261                                TTI::TCK_RecipThroughput, I);
6262   }
6263   return getWideningCost(I, VF);
6264 }
6265 
// Returns the cost of \p I at \p VF together with a flag telling whether the
// vector type was legal without splitting (TypeNotScalarized). Handles the
// uniform, profitable-to-scalarize and forced-scalar special cases before
// delegating to the opcode-based overload.
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                               ElementCount VF) {
  assert(!VF.Scalable &&
         "the cost model is not yet implemented for scalable vectorization");
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = ElementCount::getFixed(1);

  if (VF.isVector() && isProfitableToScalarize(I, VF))
    return VectorizationCostTy(InstsToScalarize[VF][I], false);

  // Forced scalars do not have any scalarization overhead.
  // Their cost is VF copies of the scalar-VF cost.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.count(I))
      return VectorizationCostTy(
          (getInstructionCost(I, ElementCount::getFixed(1)).first * VF.Min),
          false);
  }

  Type *VectorTy;
  unsigned C = getInstructionCost(I, VF, VectorTy);

  // The type is "not scalarized" if the target can hold it in fewer than VF
  // parts, i.e. it stays a genuine vector after legalization.
  bool TypeNotScalarized = VF.isVector() && VectorTy->isVectorTy() &&
                           TTI.getNumberOfParts(VectorTy) < VF.Min;
  return VectorizationCostTy(C, TypeNotScalarized);
}
6296 
// Estimates the insertelement/extractelement overhead incurred when \p I is
// scalarized inside a vectorized loop at \p VF: inserts for the produced
// value and extracts for its operands, modulo target features that make some
// of those free.
unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
                                                              ElementCount VF) {

  assert(!VF.Scalable &&
         "cannot compute scalarization overhead for scalable vectorization");
  if (VF.isScalar())
    return 0;

  unsigned Cost = 0;
  // Cost of inserting the scalar results back into a vector (all lanes).
  Type *RetTy = ToVectorTy(I->getType(), VF);
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
    Cost += TTI.getScalarizationOverhead(
        cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.Min), true, false);

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider.
  // For calls only the arguments matter, not the callee operand.
  CallInst *CI = dyn_cast<CallInst>(I);
  Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not incur
  // any overhead.
  return Cost +
         TTI.getOperandsScalarizationOverhead(filterExtractingOperands(Ops, VF),
                                              VF.Min);
}
6330 
// For the given VF, decide for every memory instruction in the loop how it
// will be vectorized (widen, reverse-widen, interleave, gather/scatter or
// scalarize) and record the decision with its cost. Afterwards, if the
// target prefers scalar addressing, force address computations (and loads
// feeding them) to remain scalar.
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  assert(!VF.Scalable && "scalable vectors not yet supported.");
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      // Only loads and stores (instructions with a pointer operand) are
      // decided here.
      Value *Ptr =  getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;

      if (Legal->isUniform(Ptr) &&
          // Conditional loads and stores should be scalarized and predicated.
          // isScalarWithPredication cannot be used here since masked
          // gather/scatters are not considered scalar with predication.
          !Legal->blockNeedsPredication(I.getParent())) {
        // TODO: Avoid replicating loads and stores instead of
        // relying on instcombine to remove them.
        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        unsigned Cost = getUniformMemOpCost(&I, VF);
        setWideningDecision(&I, VF, CM_Scalarize, Cost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride =
               Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      unsigned GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : std::numeric_limits<unsigned>::max();

      unsigned ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      // Ties (<=) between interleave and gather/scatter favor interleave.
      unsigned Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instructions belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
        dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  // Transitive closure over same-block, non-PHI operands.
  SmallVector<Instruction *, 4> Worklist;
  for (auto *I : AddrDefs)
    Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.Min * getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.Min *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}
6486 
// Per-opcode cost computation for instruction \p I at \p VF. Also returns
// (via \p VectorTy) the type the instruction will have after vectorization,
// which callers use to reason about type legalization.
unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                        ElementCount VF,
                                                        Type *&VectorTy) {
  Type *RetTy = I->getType();
  // If the instruction was proven to need fewer bits, cost it at the
  // narrowed type.
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Return cost for branches around scalarized and predicated blocks.
      assert(!VF.Scalable && "scalable vectors not yet supported.");
      auto *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (TTI.getScalarizationOverhead(
                  Vec_i1Ty, APInt::getAllOnesValue(VF.Min), false, true) +
              (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.Min));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br, CostKind);
    else
      // This branch will be eliminated by if-conversion.
      return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
    if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
      return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
                                cast<VectorType>(VectorTy), VF.Min - 1,
                                FixedVectorType::get(RetTy, 1));

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
                 CostKind);

    return TTI.getCFInstrCost(Instruction::PHI, CostKind);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // If we have a predicated instruction, it may not be executed for each
    // vector lane. Get the scalarization cost and scale this amount by the
    // probability of executing the predicated block. If the instruction is not
    // predicated, we fall through to the next case.
    if (VF.isVector() && isScalarWithPredication(I)) {
      unsigned Cost = 0;

      // These instructions have a non-void type, so account for the phi nodes
      // that we will create. This cost is likely to be zero. The phi node
      // cost, if any, should be scaled by the block probability because it
      // models a copy at the end of each predicated block.
      Cost += VF.Min * TTI.getCFInstrCost(Instruction::PHI, CostKind);

      // The cost of the non-predicated instruction.
      Cost +=
          VF.Min * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);

      // The cost of insertelement and extractelement instructions needed for
      // scalarization.
      Cost += getScalarizationOverhead(I, VF);

      // Scale the cost by the probability of executing the predicated blocks.
      // This assumes the predicated block for each vector lane is equally
      // likely.
      return Cost / getReciprocalPredBlockProb();
    }
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Since we will replace the stride by 1 the multiplication should go away.
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(1);
    TargetTransformInfo::OperandValueProperties Op2VP;
    TargetTransformInfo::OperandValueKind Op2VK =
        TTI.getOperandInfo(Op2, Op2VP);
    if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
      Op2VK = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    // Scalarized instructions pay the scalar cost VF times.
    unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, CostKind,
                   TargetTransformInfo::OK_AnyValue,
                   Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
  }
  case Instruction::FNeg: {
    assert(!VF.Scalable && "VF is assumed to be non scalable.");
    unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, CostKind,
                   TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
                   I->getOperand(0), I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    // A loop-invariant condition stays scalar; otherwise it is widened.
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond) {
      assert(!VF.Scalable && "VF is assumed to be non scalable.");
      CondTy = VectorType::get(CondTy, VF);
    }
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
                                  CostKind, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    // Cost the compare at the (possibly narrowed) operand type.
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
                                  I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    ElementCount Width = VF;
    if (Width.isVector()) {
      InstWidening Decision = getWideningDecision(I, Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      // A scalarized access produces a scalar type, not a vector one.
      if (Decision == CM_Scalarize)
        Width = ElementCount::getFixed(1);
    }
    VectorTy = ToVectorTy(getMemInstValueType(I), Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // Computes the CastContextHint from a Load/Store instruction.
    auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             "Expected a load or a store!");

      if (VF.isScalar() || !TheLoop->contains(I))
        return TTI::CastContextHint::Normal;

      switch (getWideningDecision(I, VF)) {
      case LoopVectorizationCostModel::CM_GatherScatter:
        return TTI::CastContextHint::GatherScatter;
      case LoopVectorizationCostModel::CM_Interleave:
        return TTI::CastContextHint::Interleave;
      case LoopVectorizationCostModel::CM_Scalarize:
      case LoopVectorizationCostModel::CM_Widen:
        return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
                                        : TTI::CastContextHint::Normal;
      case LoopVectorizationCostModel::CM_Widen_Reverse:
        return TTI::CastContextHint::Reversed;
      case LoopVectorizationCostModel::CM_Unknown:
        llvm_unreachable("Instr did not go through cost modelling?");
      }

      llvm_unreachable("Unhandled case!");
    };

    unsigned Opcode = I->getOpcode();
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc, the context is the only user, which must be a StoreInst.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      if (I->hasOneUse())
        if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
          CCH = ComputeCCH(Store);
    }
    // For Z/Sext, the context is the operand, which must be a LoadInst.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
        CCH = ComputeCCH(Load);
    }

    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(I);
      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
                                  Trunc->getSrcTy(), CCH, CostKind, Trunc);
    }

    Type *SrcScalarTy = I->getOperand(0)->getType();
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
    if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
      //
      // Calculate the modified src and dest types.
      Type *MinVecTy = VectorTy;
      if (Opcode == Instruction::Trunc) {
        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
        SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      }
    }

    assert(!VF.Scalable && "VF is assumed to be non scalable");
    unsigned N = isScalarAfterVectorization(I, VF) ? VF.Min : 1;
    return N *
           TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
  }
  case Instruction::Call: {
    bool NeedToScalarize;
    CallInst *CI = cast<CallInst>(I);
    unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
    // Prefer the intrinsic lowering when it is cheaper than a call.
    if (getVectorIntrinsicIDForCall(CI, TLI))
      return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
    return CallCost;
  }
  default:
    // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
    return VF.Min *
               TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
                                          CostKind) +
           getScalarizationOverhead(I, VF);
  } // end of switch.
}
6770 
// Legacy pass-manager registration: pass ID, display name, and the analyses
// the loop vectorizer depends on.
char LoopVectorize::ID = 0;

static const char lv_name[] = "Loop Vectorization";

INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6791 
6792 namespace llvm {
6793 
6794 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6795 
6796 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6797                               bool VectorizeOnlyWhenForced) {
6798   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6799 }
6800 
6801 } // end namespace llvm
6802 
6803 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6804   // Check if the pointer operand of a load or store instruction is
6805   // consecutive.
6806   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6807     return Legal->isConsecutivePtr(Ptr);
6808   return false;
6809 }
6810 
6811 void LoopVectorizationCostModel::collectValuesToIgnore() {
6812   // Ignore ephemeral values.
6813   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6814 
6815   // Ignore type-promoting instructions we identified during reduction
6816   // detection.
6817   for (auto &Reduction : Legal->getReductionVars()) {
6818     RecurrenceDescriptor &RedDes = Reduction.second;
6819     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6820     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6821   }
6822   // Ignore type-casting instructions we identified during induction
6823   // detection.
6824   for (auto &Induction : Legal->getInductionVars()) {
6825     InductionDescriptor &IndDes = Induction.second;
6826     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6827     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6828   }
6829 }
6830 
6831 void LoopVectorizationCostModel::collectInLoopReductions() {
6832   // For the moment, without predicated reduction instructions, we do not
6833   // support inloop reductions whilst folding the tail, and hence in those cases
6834   // all reductions are currently out of the loop.
6835   if (!PreferInLoopReductions || foldTailByMasking())
6836     return;
6837 
6838   for (auto &Reduction : Legal->getReductionVars()) {
6839     PHINode *Phi = Reduction.first;
6840     RecurrenceDescriptor &RdxDesc = Reduction.second;
6841 
6842     // We don't collect reductions that are type promoted (yet).
6843     if (RdxDesc.getRecurrenceType() != Phi->getType())
6844       continue;
6845 
6846     // Check that we can correctly put the reductions into the loop, by
6847     // finding the chain of operations that leads from the phi to the loop
6848     // exit value.
6849     SmallVector<Instruction *, 4> ReductionOperations =
6850         RdxDesc.getReductionOpChain(Phi, TheLoop);
6851     bool InLoop = !ReductionOperations.empty();
6852     if (InLoop)
6853       InLoopReductionChains[Phi] = ReductionOperations;
6854     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6855                       << " reduction for phi: " << *Phi << "\n");
6856   }
6857 }
6858 
6859 // TODO: we could return a pair of values that specify the max VF and
6860 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6861 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6862 // doesn't have a cost model that can choose which plan to execute if
6863 // more than one is generated.
6864 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6865                                  LoopVectorizationCostModel &CM) {
6866   unsigned WidestType;
6867   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6868   return WidestVectorRegBits / WidestType;
6869 }
6870 
6871 VectorizationFactor
6872 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
6873   assert(!UserVF.Scalable && "scalable vectors not yet supported");
6874   ElementCount VF = UserVF;
6875   // Outer loop handling: They may require CFG and instruction level
6876   // transformations before even evaluating whether vectorization is profitable.
6877   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6878   // the vectorization pipeline.
6879   if (!OrigLoop->empty()) {
6880     // If the user doesn't provide a vectorization factor, determine a
6881     // reasonable one.
6882     if (UserVF.isZero()) {
6883       VF = ElementCount::getFixed(
6884           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
6885       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6886 
6887       // Make sure we have a VF > 1 for stress testing.
6888       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6889         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6890                           << "overriding computed VF.\n");
6891         VF = ElementCount::getFixed(4);
6892       }
6893     }
6894     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6895     assert(isPowerOf2_32(VF.Min) && "VF needs to be a power of two");
6896     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6897                       << "VF " << VF << " to build VPlans.\n");
6898     buildVPlans(VF.Min, VF.Min);
6899 
6900     // For VPlan build stress testing, we bail out after VPlan construction.
6901     if (VPlanBuildStressTest)
6902       return VectorizationFactor::Disabled();
6903 
6904     return {VF, 0 /*Cost*/};
6905   }
6906 
6907   LLVM_DEBUG(
6908       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6909                 "VPlan-native path.\n");
6910   return VectorizationFactor::Disabled();
6911 }
6912 
6913 Optional<VectorizationFactor>
6914 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6915   assert(!UserVF.Scalable && "scalable vectorization not yet handled");
6916   assert(OrigLoop->empty() && "Inner loop expected.");
6917   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF.Min, UserIC);
6918   if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
6919     return None;
6920 
6921   // Invalidate interleave groups if all blocks of loop will be predicated.
6922   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6923       !useMaskedInterleavedAccesses(*TTI)) {
6924     LLVM_DEBUG(
6925         dbgs()
6926         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6927            "which requires masked-interleaved support.\n");
6928     if (CM.InterleaveInfo.invalidateGroups())
6929       // Invalidating interleave groups also requires invalidating all decisions
6930       // based on them, which includes widening decisions and uniform and scalar
6931       // values.
6932       CM.invalidateCostModelingDecisions();
6933   }
6934 
6935   if (!UserVF.isZero()) {
6936     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6937     assert(isPowerOf2_32(UserVF.Min) && "VF needs to be a power of two");
6938     // Collect the instructions (and their associated costs) that will be more
6939     // profitable to scalarize.
6940     CM.selectUserVectorizationFactor(UserVF);
6941     CM.collectInLoopReductions();
6942     buildVPlansWithVPRecipes(UserVF.Min, UserVF.Min);
6943     LLVM_DEBUG(printPlans(dbgs()));
6944     return {{UserVF, 0}};
6945   }
6946 
6947   unsigned MaxVF = MaybeMaxVF.getValue();
6948   assert(MaxVF != 0 && "MaxVF is zero.");
6949 
6950   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6951     // Collect Uniform and Scalar instructions after vectorization with VF.
6952     CM.collectUniformsAndScalars(ElementCount::getFixed(VF));
6953 
6954     // Collect the instructions (and their associated costs) that will be more
6955     // profitable to scalarize.
6956     if (VF > 1)
6957       CM.collectInstsToScalarize(ElementCount::getFixed(VF));
6958   }
6959 
6960   CM.collectInLoopReductions();
6961 
6962   buildVPlansWithVPRecipes(1, MaxVF);
6963   LLVM_DEBUG(printPlans(dbgs()));
6964   if (MaxVF == 1)
6965     return VectorizationFactor::Disabled();
6966 
6967   // Select the optimal vectorization factor.
6968   return CM.selectVectorizationFactor(MaxVF);
6969 }
6970 
6971 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
6972   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6973                     << '\n');
6974   BestVF = VF;
6975   BestUF = UF;
6976 
6977   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6978     return !Plan->hasVF(VF);
6979   });
6980   assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
6981 }
6982 
/// Generate the vectorized loop by executing the single remaining VPlan.
/// setBestPlan() must have run first (it fixes BestVF/BestUF and prunes
/// VPlans down to one).
void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  VPCallbackILV CallbackILV(ILV);

  assert(BestVF.hasValue() && "Vectorization Factor is missing");

  // Bundle everything VPlan execution needs: chosen VF/UF, analyses, the IR
  // builder and value maps owned by ILV, plus the callback into ILV.
  VPTransformState State{*BestVF, BestUF,      LI,
                         DT,      ILV.Builder, ILV.VectorLoopValueMap,
                         &ILV,    CallbackILV};
  // The skeleton provides the insertion point for the vector body; trip count
  // and canonical IV come from the skeleton ILV just created.
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  State.TripCount = ILV.getOrCreateTripCount(nullptr);
  State.CanonicalIV = ILV.Induction;

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
  VPlans.front()->execute(&State);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  //    predication, updating analyses.
  ILV.fixVectorizedLoop();
}
7015 
7016 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7017     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7018   BasicBlock *Latch = OrigLoop->getLoopLatch();
7019 
7020   // We create new control-flow for the vectorized loop, so the original
7021   // condition will be dead after vectorization if it's only used by the
7022   // branch.
7023   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
7024   if (Cmp && Cmp->hasOneUse())
7025     DeadInstructions.insert(Cmp);
7026 
7027   // We create new "steps" for induction variable updates to which the original
7028   // induction variables map. An original update instruction will be dead if
7029   // all its users except the induction variable are dead.
7030   for (auto &Induction : Legal->getInductionVars()) {
7031     PHINode *Ind = Induction.first;
7032     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7033     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7034           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7035         }))
7036       DeadInstructions.insert(IndUpdate);
7037 
7038     // We record as "Dead" also the type-casting instructions we had identified
7039     // during induction analysis. We don't need any handling for them in the
7040     // vectorized loop because we have proven that, under a proper runtime
7041     // test guarding the vectorized loop, the value of the phi, and the casted
7042     // value of the phi, are the same. The last instruction in this casting chain
7043     // will get its scalar/vector/widened def from the scalar/vector/widened def
7044     // of the respective phi node. Any other casts in the induction def-use chain
7045     // have no other uses outside the phi update chain, and will be ignored.
7046     InductionDescriptor &IndDes = Induction.second;
7047     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7048     DeadInstructions.insert(Casts.begin(), Casts.end());
7049   }
7050 }
7051 
7052 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7053 
7054 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7055 
7056 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7057                                         Instruction::BinaryOps BinOp) {
7058   // When unrolling and the VF is 1, we only need to add a simple scalar.
7059   Type *Ty = Val->getType();
7060   assert(!Ty->isVectorTy() && "Val must be a scalar");
7061 
7062   if (Ty->isFloatingPointTy()) {
7063     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7064 
7065     // Floating point operations had to be 'fast' to enable the unrolling.
7066     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7067     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7068   }
7069   Constant *C = ConstantInt::get(Ty, StartIdx);
7070   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7071 }
7072 
7073 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7074   SmallVector<Metadata *, 4> MDs;
7075   // Reserve first location for self reference to the LoopID metadata node.
7076   MDs.push_back(nullptr);
7077   bool IsUnrollMetadata = false;
7078   MDNode *LoopID = L->getLoopID();
7079   if (LoopID) {
7080     // First find existing loop unrolling disable metadata.
7081     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7082       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7083       if (MD) {
7084         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7085         IsUnrollMetadata =
7086             S && S->getString().startswith("llvm.loop.unroll.disable");
7087       }
7088       MDs.push_back(LoopID->getOperand(i));
7089     }
7090   }
7091 
7092   if (!IsUnrollMetadata) {
7093     // Add runtime unroll disable metadata.
7094     LLVMContext &Context = L->getHeader()->getContext();
7095     SmallVector<Metadata *, 1> DisableOperands;
7096     DisableOperands.push_back(
7097         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7098     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7099     MDs.push_back(DisableNode);
7100     MDNode *NewLoopID = MDNode::get(Context, MDs);
7101     // Set operand 0 to refer to the loop id itself.
7102     NewLoopID->replaceOperandWith(0, NewLoopID);
7103     L->setLoopID(NewLoopID);
7104   }
7105 }
7106 
7107 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7108     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7109   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
7110   bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start));
7111 
7112   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
7113     if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) {
7114       Range.End = TmpVF;
7115       break;
7116     }
7117 
7118   return PredicateAtRangeStart;
7119 }
7120 
7121 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7122 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7123 /// of VF's starting at a given VF and extending it as much as possible. Each
7124 /// vectorization decision can potentially shorten this sub-range during
7125 /// buildVPlan().
7126 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
7127   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7128     VFRange SubRange = {VF, MaxVF + 1};
7129     VPlans.push_back(buildVPlan(SubRange));
7130     VF = SubRange.End;
7131   }
7132 }
7133 
7134 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7135                                          VPlanPtr &Plan) {
7136   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7137 
7138   // Look for cached value.
7139   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7140   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7141   if (ECEntryIt != EdgeMaskCache.end())
7142     return ECEntryIt->second;
7143 
7144   VPValue *SrcMask = createBlockInMask(Src, Plan);
7145 
7146   // The terminator has to be a branch inst!
7147   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7148   assert(BI && "Unexpected terminator found");
7149 
7150   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7151     return EdgeMaskCache[Edge] = SrcMask;
7152 
7153   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
7154   assert(EdgeMask && "No Edge Mask found for condition");
7155 
7156   if (BI->getSuccessor(0) != Dst)
7157     EdgeMask = Builder.createNot(EdgeMask);
7158 
7159   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7160     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7161 
7162   return EdgeMaskCache[Edge] = EdgeMask;
7163 }
7164 
/// Compute (and cache) the predicate mask under which block \p BB executes in
/// the vectorized loop. A nullptr result is the canonical encoding of an
/// all-ones mask, i.e. no predication needed.
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Look for cached value.
  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
  if (BCEntryIt != BlockMaskCache.end())
    return BCEntryIt->second;

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;

  if (OrigLoop->getHeader() == BB) {
    if (!CM.blockNeedsPredication(BB))
      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.

    // Introduce the early-exit compare IV <= BTC to form header block mask.
    // This is used instead of IV < TC because TC may wrap, unlike BTC.
    // Start by constructing the desired canonical IV.
    VPValue *IV = nullptr;
    if (Legal->getPrimaryInduction())
      IV = Plan->getVPValue(Legal->getPrimaryInduction());
    else {
      // No primary induction is available: synthesize a canonical IV recipe
      // in the current insertion block.
      auto IVRecipe = new VPWidenCanonicalIVRecipe();
      Builder.getInsertBlock()->appendRecipe(IVRecipe);
      IV = IVRecipe->getVPValue();
    }
    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
    bool TailFolded = !CM.isScalarEpilogueAllowed();

    if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // While ActiveLaneMask is a binary op that consumes the loop tripcount
      // as a second argument, we only pass the IV here and extract the
      // tripcount from the transform state where codegen of the VP instructions
      // happen.
      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
    } else {
      BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
    }
    return BlockMaskCache[BB] = BlockMask;
  }

  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
      return BlockMaskCache[BB] = EdgeMask;

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask);
  }

  return BlockMaskCache[BB] = BlockMask;
}
7223 
7224 VPWidenMemoryInstructionRecipe *
7225 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7226                                   VPlanPtr &Plan) {
7227   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7228          "Must be called with either a load or store");
7229 
7230   auto willWiden = [&](ElementCount VF) -> bool {
7231     assert(!VF.Scalable && "unexpected scalable ElementCount");
7232     if (VF.isScalar())
7233       return false;
7234     LoopVectorizationCostModel::InstWidening Decision =
7235         CM.getWideningDecision(I, VF);
7236     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7237            "CM decision should be taken at this point.");
7238     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7239       return true;
7240     if (CM.isScalarAfterVectorization(I, VF) ||
7241         CM.isProfitableToScalarize(I, VF))
7242       return false;
7243     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7244   };
7245 
7246   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7247     return nullptr;
7248 
7249   VPValue *Mask = nullptr;
7250   if (Legal->isMaskRequired(I))
7251     Mask = createBlockInMask(I->getParent(), Plan);
7252 
7253   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
7254   if (LoadInst *Load = dyn_cast<LoadInst>(I))
7255     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
7256 
7257   StoreInst *Store = cast<StoreInst>(I);
7258   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
7259   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
7260 }
7261 
7262 VPWidenIntOrFpInductionRecipe *
7263 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
7264   // Check if this is an integer or fp induction. If so, build the recipe that
7265   // produces its scalar and vector values.
7266   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
7267   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
7268       II.getKind() == InductionDescriptor::IK_FpInduction)
7269     return new VPWidenIntOrFpInductionRecipe(Phi);
7270 
7271   return nullptr;
7272 }
7273 
7274 VPWidenIntOrFpInductionRecipe *
7275 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
7276                                                 VFRange &Range) const {
7277   // Optimize the special case where the source is a constant integer
7278   // induction variable. Notice that we can only optimize the 'trunc' case
7279   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7280   // (c) other casts depend on pointer size.
7281 
7282   // Determine whether \p K is a truncation based on an induction variable that
7283   // can be optimized.
7284   auto isOptimizableIVTruncate =
7285       [&](Instruction *K) -> std::function<bool(ElementCount)> {
7286     return [=](ElementCount VF) -> bool {
7287       return CM.isOptimizableIVTruncate(K, VF);
7288     };
7289   };
7290 
7291   if (LoopVectorizationPlanner::getDecisionAndClampRange(
7292           isOptimizableIVTruncate(I), Range))
7293     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
7294                                              I);
7295   return nullptr;
7296 }
7297 
7298 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
7299   // We know that all PHIs in non-header blocks are converted into selects, so
7300   // we don't have to worry about the insertion order and we can just use the
7301   // builder. At this point we generate the predication tree. There may be
7302   // duplications since this is a simple recursive scan, but future
7303   // optimizations will clean it up.
7304 
7305   SmallVector<VPValue *, 2> Operands;
7306   unsigned NumIncoming = Phi->getNumIncomingValues();
7307   for (unsigned In = 0; In < NumIncoming; In++) {
7308     VPValue *EdgeMask =
7309       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
7310     assert((EdgeMask || NumIncoming == 1) &&
7311            "Multiple predecessors with one having a full mask");
7312     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
7313     if (EdgeMask)
7314       Operands.push_back(EdgeMask);
7315   }
7316   return new VPBlendRecipe(Phi, Operands);
7317 }
7318 
7319 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
7320                                                    VPlan &Plan) const {
7321 
7322   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7323       [this, CI](ElementCount VF) {
7324         return CM.isScalarWithPredication(CI, VF);
7325       },
7326       Range);
7327 
7328   if (IsPredicated)
7329     return nullptr;
7330 
7331   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7332   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7333              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
7334     return nullptr;
7335 
7336   auto willWiden = [&](ElementCount VF) -> bool {
7337     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7338     // The following case may be scalarized depending on the VF.
7339     // The flag shows whether we use Intrinsic or a usual Call for vectorized
7340     // version of the instruction.
7341     // Is it beneficial to perform intrinsic call compared to lib call?
7342     bool NeedToScalarize = false;
7343     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
7344     bool UseVectorIntrinsic =
7345         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
7346     return UseVectorIntrinsic || !NeedToScalarize;
7347   };
7348 
7349   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7350     return nullptr;
7351 
7352   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
7353 }
7354 
7355 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7356   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7357          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
7358   // Instruction should be widened, unless it is scalar after vectorization,
7359   // scalarization is profitable or it is predicated.
7360   auto WillScalarize = [this, I](ElementCount VF) -> bool {
7361     return CM.isScalarAfterVectorization(I, VF) ||
7362            CM.isProfitableToScalarize(I, VF) ||
7363            CM.isScalarWithPredication(I, VF);
7364   };
7365   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
7366                                                              Range);
7367 }
7368 
7369 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
7370   auto IsVectorizableOpcode = [](unsigned Opcode) {
7371     switch (Opcode) {
7372     case Instruction::Add:
7373     case Instruction::And:
7374     case Instruction::AShr:
7375     case Instruction::BitCast:
7376     case Instruction::FAdd:
7377     case Instruction::FCmp:
7378     case Instruction::FDiv:
7379     case Instruction::FMul:
7380     case Instruction::FNeg:
7381     case Instruction::FPExt:
7382     case Instruction::FPToSI:
7383     case Instruction::FPToUI:
7384     case Instruction::FPTrunc:
7385     case Instruction::FRem:
7386     case Instruction::FSub:
7387     case Instruction::ICmp:
7388     case Instruction::IntToPtr:
7389     case Instruction::LShr:
7390     case Instruction::Mul:
7391     case Instruction::Or:
7392     case Instruction::PtrToInt:
7393     case Instruction::SDiv:
7394     case Instruction::Select:
7395     case Instruction::SExt:
7396     case Instruction::Shl:
7397     case Instruction::SIToFP:
7398     case Instruction::SRem:
7399     case Instruction::Sub:
7400     case Instruction::Trunc:
7401     case Instruction::UDiv:
7402     case Instruction::UIToFP:
7403     case Instruction::URem:
7404     case Instruction::Xor:
7405     case Instruction::ZExt:
7406       return true;
7407     }
7408     return false;
7409   };
7410 
7411   if (!IsVectorizableOpcode(I->getOpcode()))
7412     return nullptr;
7413 
7414   // Success: widen this instruction.
7415   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
7416 }
7417 
7418 VPBasicBlock *VPRecipeBuilder::handleReplication(
7419     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7420     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7421     VPlanPtr &Plan) {
7422   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7423       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
7424       Range);
7425 
7426   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7427       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
7428       Range);
7429 
7430   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
7431                                        IsUniform, IsPredicated);
7432   setRecipe(I, Recipe);
7433 
7434   // Find if I uses a predicated instruction. If so, it will use its scalar
7435   // value. Avoid hoisting the insert-element which packs the scalar value into
7436   // a vector value, as that happens iff all users use the vector value.
7437   for (auto &Op : I->operands())
7438     if (auto *PredInst = dyn_cast<Instruction>(Op))
7439       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7440         PredInst2Recipe[PredInst]->setAlsoPack(false);
7441 
7442   // Finalize the recipe for Instr, first if it is not predicated.
7443   if (!IsPredicated) {
7444     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7445     VPBB->appendRecipe(Recipe);
7446     return VPBB;
7447   }
7448   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7449   assert(VPBB->getSuccessors().empty() &&
7450          "VPBB has successors when handling predicated replication.");
7451   // Record predicated instructions for above packing optimizations.
7452   PredInst2Recipe[I] = Recipe;
7453   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7454   VPBlockUtils::insertBlockAfter(Region, VPBB);
7455   auto *RegSucc = new VPBasicBlock();
7456   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7457   return RegSucc;
7458 }
7459 
/// Wrap the predicated recipe \p PredRecipe (replicating \p Instr) in a
/// triangular if-then VPRegionBlock:
///   entry(branch-on-mask) -> if(PredRecipe) -> continue(phi)
/// so the replicated instruction only executes for active lanes.
VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  // Void-valued instructions produce no result to merge, so the continue
  // block needs no phi recipe in that case.
  auto *PHIRecipe =
      Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}
7487 
7488 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
7489                                                       VFRange &Range,
7490                                                       VPlanPtr &Plan) {
7491   // First, check for specific widening recipes that deal with calls, memory
7492   // operations, inductions and Phi nodes.
7493   if (auto *CI = dyn_cast<CallInst>(Instr))
7494     return tryToWidenCall(CI, Range, *Plan);
7495 
7496   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
7497     return tryToWidenMemory(Instr, Range, Plan);
7498 
7499   VPRecipeBase *Recipe;
7500   if (auto Phi = dyn_cast<PHINode>(Instr)) {
7501     if (Phi->getParent() != OrigLoop->getHeader())
7502       return tryToBlend(Phi, Plan);
7503     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
7504       return Recipe;
7505     return new VPWidenPHIRecipe(Phi);
7506   }
7507 
7508   if (isa<TruncInst>(Instr) &&
7509       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
7510     return Recipe;
7511 
7512   if (!shouldWiden(Instr, Range))
7513     return nullptr;
7514 
7515   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
7516     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
7517                                 OrigLoop);
7518 
7519   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
7520     bool InvariantCond =
7521         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
7522     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
7523                                    InvariantCond);
7524   }
7525 
7526   return tryToWiden(Instr, *Plan);
7527 }
7528 
7529 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7530                                                         unsigned MaxVF) {
7531   assert(OrigLoop->empty() && "Inner loop expected.");
7532 
7533   // Collect conditions feeding internal conditional branches; they need to be
7534   // represented in VPlan for it to model masking.
7535   SmallPtrSet<Value *, 1> NeedDef;
7536 
7537   auto *Latch = OrigLoop->getLoopLatch();
7538   for (BasicBlock *BB : OrigLoop->blocks()) {
7539     if (BB == Latch)
7540       continue;
7541     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7542     if (Branch && Branch->isConditional())
7543       NeedDef.insert(Branch->getCondition());
7544   }
7545 
7546   // If the tail is to be folded by masking, the primary induction variable, if
7547   // exists needs to be represented in VPlan for it to model early-exit masking.
7548   // Also, both the Phi and the live-out instruction of each reduction are
7549   // required in order to introduce a select between them in VPlan.
7550   if (CM.foldTailByMasking()) {
7551     if (Legal->getPrimaryInduction())
7552       NeedDef.insert(Legal->getPrimaryInduction());
7553     for (auto &Reduction : Legal->getReductionVars()) {
7554       NeedDef.insert(Reduction.first);
7555       NeedDef.insert(Reduction.second.getLoopExitInstr());
7556     }
7557   }
7558 
7559   // Collect instructions from the original loop that will become trivially dead
7560   // in the vectorized loop. We don't need to vectorize these instructions. For
7561   // example, original induction update instructions can become dead because we
7562   // separately emit induction "steps" when generating code for the new loop.
7563   // Similarly, we create a new latch condition when setting up the structure
7564   // of the new loop, so the old one can become dead.
7565   SmallPtrSet<Instruction *, 4> DeadInstructions;
7566   collectTriviallyDeadInstructions(DeadInstructions);
7567 
7568   // Add assume instructions we need to drop to DeadInstructions, to prevent
7569   // them from being added to the VPlan.
7570   // TODO: We only need to drop assumes in blocks that get flattend. If the
7571   // control flow is preserved, we should keep them.
7572   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7573   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7574 
7575   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7576   // Dead instructions do not need sinking. Remove them from SinkAfter.
7577   for (Instruction *I : DeadInstructions)
7578     SinkAfter.erase(I);
7579 
7580   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7581     VFRange SubRange = {VF, MaxVF + 1};
7582     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7583                                              DeadInstructions, SinkAfter));
7584     VF = SubRange.End;
7585   }
7586 }
7587 
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
    SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  // Interleave groups relevant for this Range; their member recipes are
  // replaced by a single VPInterleaveRecipe after initial construction.
  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  // For in-loop reductions, record the phi and every operation on the chain so
  // that their recipes can be replaced later (see
  // adjustRecipesForInLoopReductions).
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor::RecurrenceKind Kind =
        Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
          Kind == RecurrenceDescriptor::RK_FloatMinMax) {
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
      }
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](ElementCount VF) -> bool {
      return (VF.isVector() && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  };

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan; it is
  // discarded once construction is complete (see below).
  auto Plan = std::make_unique<VPlan>();
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  Plan->setEntry(VPBB);

  // Represent values that will have defs inside VPlan.
  for (Value *V : NeedDef)
    Plan->addVPValue(V);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
        continue;

      // Prefer a widening recipe when one applies.
      if (auto Recipe =
              RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
        RecipeBuilder.setRecipe(Instr, Recipe);
        VPBB->appendRecipe(Recipe);
        continue;
      }

      // Otherwise, if all widening options failed, Instruction is to be
      // replicated. This may create a successor for VPBB (when the replication
      // is predicated and wrapped in a region).
      VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
          Instr, Range, VPBB, PredInst2Recipe, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one VPBB, reflecting original
  // basic-blocks with no recipes.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  // Apply Sink-After legal constraints: move each recorded recipe right after
  // its target.
  for (auto &Entry : SinkAfter) {
    VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
    VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
    Sink->moveAfter(Target);
  }

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with a
  // single VPInterleaveRecipe at its insertion point.
  for (auto IG : InterleaveGroups) {
    auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
        RecipeBuilder.getRecipe(IG->getInsertPos()));
    (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
        ->insertBefore(Recipe);

    // Erase all member recipes, including the insert-position recipe itself.
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (Instruction *Member = IG->getMember(i)) {
        RecipeBuilder.getRecipe(Member)->eraseFromParent();
      }
  }

  // Adjust the recipes for any inloop reductions; these only apply to vector
  // plans (Range.Start > 1).
  if (Range.Start > 1)
    adjustRecipesForInLoopReductions(Plan, RecipeBuilder);

  // Finally, if tail is folded by masking, introduce selects between the phi
  // and the live-out instruction of each reduction, at the end of the latch.
  if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
    Builder.setInsertPoint(VPBB);
    auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
    for (auto &Reduction : Legal->getReductionVars()) {
      assert(!CM.isInLoopReduction(Reduction.first) &&
             "Didn't expect inloop tail folded reduction yet!");
      VPValue *Phi = Plan->getVPValue(Reduction.first);
      VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
      Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
    }
  }

  // Register all VFs this plan covers and give the plan a descriptive name,
  // e.g. "Initial VPlan for VF={4,8},UF>=1".
  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  ElementCount VF = ElementCount::getFixed(Range.Start);
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF.Min *= 2; VF.Min < Range.End; VF.Min *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  return Plan;
}
7775 
7776 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7777   // Outer loop handling: They may require CFG and instruction level
7778   // transformations before even evaluating whether vectorization is profitable.
7779   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7780   // the vectorization pipeline.
7781   assert(!OrigLoop->empty());
7782   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7783 
7784   // Create new empty VPlan
7785   auto Plan = std::make_unique<VPlan>();
7786 
7787   // Build hierarchical CFG
7788   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7789   HCFGBuilder.buildHierarchicalCFG();
7790 
7791   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7792     Plan->addVF(ElementCount::getFixed(VF));
7793 
7794   if (EnableVPlanPredication) {
7795     VPlanPredicator VPP(*Plan);
7796     VPP.predicate();
7797 
7798     // Avoid running transformation to recipes until masked code generation in
7799     // VPlan-native path is in place.
7800     return Plan;
7801   }
7802 
7803   SmallPtrSet<Instruction *, 1> DeadInstructions;
7804   VPlanTransforms::VPInstructionsToVPRecipes(
7805       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7806   return Plan;
7807 }
7808 
// Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi need to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep a track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
    Instruction *Chain = Phi;
    for (Instruction *R : ReductionOperations) {
      VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
      RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();

      // The scalar chain feeds one side of the reduction; the vector operand
      // is whichever operand of R is not the chain.
      VPValue *ChainOp = Plan->getVPValue(Chain);
      unsigned FirstOpId;
      if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
          Kind == RecurrenceDescriptor::RK_FloatMinMax) {
        // Min/max reductions were widened as selects; operand 0 is the icmp
        // condition, so the data operands start at index 1.
        assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC &&
               "Expected to replace a VPWidenSelectSC");
        FirstOpId = 1;
      } else {
        assert(WidenRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC &&
               "Expected to replace a VPWidenSC");
        FirstOpId = 0;
      }
      unsigned VecOpId =
          R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
      VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));

      // Swap the widening recipe for an in-loop reduction recipe in place.
      VPReductionRecipe *RedRecipe = new VPReductionRecipe(
          &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr(), TTI);
      WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
      WidenRecipe->eraseFromParent();

      // For min/max, also drop the recipe of the icmp feeding the select,
      // which was recorded for exactly this purpose during pre-construction.
      if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
          Kind == RecurrenceDescriptor::RK_FloatMinMax) {
        VPRecipeBase *CompareRecipe =
            RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
        assert(CompareRecipe->getVPRecipeID() == VPRecipeBase::VPWidenSC &&
               "Expected to replace a VPWidenSC");
        CompareRecipe->eraseFromParent();
      }
      Chain = R;
    }
  }
}
7862 
7863 Value* LoopVectorizationPlanner::VPCallbackILV::
7864 getOrCreateVectorValues(Value *V, unsigned Part) {
7865       return ILV.getOrCreateVectorValue(V, Part);
7866 }
7867 
// Callback thunk: forwards scalar-value lookups for a given (part, lane)
// instance to the InnerLoopVectorizer's cache.
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
    Value *V, const VPIteration &Instance) {
  return ILV.getOrCreateScalarValue(V, Instance);
}
7872 
7873 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7874                                VPSlotTracker &SlotTracker) const {
7875   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7876   IG->getInsertPos()->printAsOperand(O, false);
7877   O << ", ";
7878   getAddr()->printAsOperand(O, SlotTracker);
7879   VPValue *Mask = getMask();
7880   if (Mask) {
7881     O << ", ";
7882     Mask->printAsOperand(O, SlotTracker);
7883   }
7884   for (unsigned i = 0; i < IG->getFactor(); ++i)
7885     if (Instruction *I = IG->getMember(i))
7886       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
7887 }
7888 
// Emit the widened form of the call ingredient via the InnerLoopVectorizer.
void VPWidenCallRecipe::execute(VPTransformState &State) {
  State.ILV->widenCallInstruction(Ingredient, User, State);
}
7892 
// Emit the widened select; InvariantCond tells codegen the condition is
// loop-invariant and may remain scalar.
void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State);
}
7896 
// Emit the generic widened form of the ingredient instruction.
void VPWidenRecipe::execute(VPTransformState &State) {
  State.ILV->widenInstruction(Ingredient, User, State);
}
7900 
// Emit the widened GEP, passing along which of its pointer/index operands
// were determined to be loop-invariant at recipe-construction time.
void VPWidenGEPRecipe::execute(VPTransformState &State) {
  State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant,
                      IsIndexLoopInvariant, State);
}
7905 
// Emit the widened induction; Trunc, when non-null, is the truncate whose
// narrower type the induction is generated in.
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");
  State.ILV->widenIntOrFpInduction(IV, Trunc);
}
7910 
// Emit the widened header phi for all unroll parts.
void VPWidenPHIRecipe::execute(VPTransformState &State) {
  State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
}
7914 
7915 void VPBlendRecipe::execute(VPTransformState &State) {
7916   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7917   // We know that all PHIs in non-header blocks are converted into
7918   // selects, so we don't have to worry about the insertion order and we
7919   // can just use the builder.
7920   // At this point we generate the predication tree. There may be
7921   // duplications since this is a simple recursive scan, but future
7922   // optimizations will clean it up.
7923 
7924   unsigned NumIncoming = getNumIncomingValues();
7925 
7926   // Generate a sequence of selects of the form:
7927   // SELECT(Mask3, In3,
7928   //        SELECT(Mask2, In2,
7929   //               SELECT(Mask1, In1,
7930   //                      In0)))
7931   // Note that Mask0 is never used: lanes for which no path reaches this phi and
7932   // are essentially undef are taken from In0.
7933   InnerLoopVectorizer::VectorParts Entry(State.UF);
7934   for (unsigned In = 0; In < NumIncoming; ++In) {
7935     for (unsigned Part = 0; Part < State.UF; ++Part) {
7936       // We might have single edge PHIs (blocks) - use an identity
7937       // 'select' for the first PHI operand.
7938       Value *In0 = State.get(getIncomingValue(In), Part);
7939       if (In == 0)
7940         Entry[Part] = In0; // Initialize with the first incoming value.
7941       else {
7942         // Select between the current value and the previous incoming edge
7943         // based on the incoming mask.
7944         Value *Cond = State.get(getMask(In), Part);
7945         Entry[Part] =
7946             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7947       }
7948     }
7949   }
7950   for (unsigned Part = 0; Part < State.UF; ++Part)
7951     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7952 }
7953 
// Emit wide loads/stores (plus shuffles) for the whole interleave group at
// the group's insertion position.
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
}
7958 
7959 void VPReductionRecipe::execute(VPTransformState &State) {
7960   assert(!State.Instance && "Reduction being replicated.");
7961   for (unsigned Part = 0; Part < State.UF; ++Part) {
7962     unsigned Kind = RdxDesc->getRecurrenceKind();
7963     Value *NewVecOp = State.get(VecOp, Part);
7964     Value *NewRed =
7965         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
7966     Value *PrevInChain = State.get(ChainOp, Part);
7967     Value *NextInChain;
7968     if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7969         Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7970       NextInChain =
7971           createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
7972                          NewRed, PrevInChain);
7973     } else {
7974       NextInChain = State.Builder.CreateBinOp(
7975           (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain);
7976     }
7977     State.ValueMap.setVectorValue(I, Part, NextInChain);
7978   }
7979 }
7980 
void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
                                    IsPredicated, State);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from undef.
      if (State.Instance->Lane == 0) {
        assert(!State.VF.Scalable && "VF is assumed to be non scalable.");
        Value *Undef =
            UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
        State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
      }
      State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.Min;
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
                                      IsPredicated, State);
}
8008 
// Materialize the per-lane condition for a predicated replicate region and
// replace the block's placeholder terminator with a conditional branch.
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane;

  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    // Extract this lane's bit when the mask is a vector.
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}
8034 
// Create the phi merging a predicated instruction's value from the "if" block
// back into the "continue" block of its replicate region.
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst = cast<Instruction>(
      State.ValueMap.getScalarValue(PredInst, *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
  } else {
    Type *PredInstType = PredInst->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    // Lanes that were masked off keep undef; the predicated path supplies the
    // computed scalar.
    Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
  }
}
8065 
// Emit the widened load/store; stores additionally carry their stored-value
// operand, loads pass nullptr.
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
  State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
                                        getMask());
}
8071 
8072 // Determine how to lower the scalar epilogue, which depends on 1) optimising
8073 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8074 // predication, and 4) a TTI hook that analyses whether the loop is suitable
8075 // for predication.
8076 static ScalarEpilogueLowering getScalarEpilogueLowering(
8077     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8078     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8079     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8080     LoopVectorizationLegality &LVL) {
8081   // 1) OptSize takes precedence over all other options, i.e. if this is set,
8082   // don't look at hints or options, and don't request a scalar epilogue.
8083   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8084   // LoopAccessInfo (due to code dependency and not being able to reliably get
8085   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8086   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
8087   // versioning when the vectorization is forced, unlike hasOptSize. So revert
8088   // back to the old way and vectorize with versioning when forced. See D81345.)
8089   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8090                                                       PGSOQueryType::IRPass) &&
8091                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8092     return CM_ScalarEpilogueNotAllowedOptSize;
8093 
8094   bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() &&
8095                               !PreferPredicateOverEpilogue;
8096 
8097   // 2) Next, if disabling predication is requested on the command line, honour
8098   // this and request a scalar epilogue.
8099   if (PredicateOptDisabled)
8100     return CM_ScalarEpilogueAllowed;
8101 
8102   // 3) and 4) look if enabling predication is requested on the command line,
8103   // with a loop hint, or if the TTI hook indicates this is profitable, request
8104   // predication.
8105   if (PreferPredicateOverEpilogue ||
8106       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
8107       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8108                                         LVL.getLAI()) &&
8109        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
8110     return CM_ScalarEpilogueNotNeededUsePredicate;
8111 
8112   return CM_ScalarEpilogueAllowed;
8113 }
8114 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  // A computable trip count is required for outer-loop vectorization.
  if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);

  // Get user vectorization factor.
  const unsigned UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF =
      LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF));

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  // Outer-loop path always uses interleave count 1.
  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                         &CM, BFI, PSI);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();

  // Function verification is intentionally debug-only (elided under NDEBUG).
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}
8172 
8173 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8174     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8175                                !EnableLoopInterleaving),
8176       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8177                               !EnableLoopVectorization) {}
8178 
// Analyze one loop and, when legal and profitable, vectorize and/or
// interleave it. Inner loops are handled inline below; outer loops are
// dispatched to processLoopInVPlanNativePath when that path is enabled.
// Returns true iff the IR was modified.
bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->empty()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  // Metadata hints (#pragma clang loop ...) attached to this loop.
  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " unroll=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->empty())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints);

  assert(L->empty() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      // Forbid a scalar epilogue so the tiny loop is not split into a vector
      // body plus a scalar remainder.
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get user vectorization factor and interleave count.
  unsigned UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  // None indicates vectorization and interleaving should both be avoided.
  Optional<VectorizationFactor> MaybeVF =
      LVP.plan(ElementCount::getFixed(UserVF), UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width == 1) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
                               BFI, PSI);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM, BFI, PSI);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  // Propagate llvm.loop.vectorize.followup_* metadata to the remainder loop,
  // if the user requested it; otherwise mark the scalar remainder so it is
  // not vectorized or runtime-unrolled again.
  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}
8475 
8476 LoopVectorizeResult LoopVectorizePass::runImpl(
8477     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
8478     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
8479     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
8480     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
8481     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
8482   SE = &SE_;
8483   LI = &LI_;
8484   TTI = &TTI_;
8485   DT = &DT_;
8486   BFI = &BFI_;
8487   TLI = TLI_;
8488   AA = &AA_;
8489   AC = &AC_;
8490   GetLAA = &GetLAA_;
8491   DB = &DB_;
8492   ORE = &ORE_;
8493   PSI = PSI_;
8494 
8495   // Don't attempt if
8496   // 1. the target claims to have no vector registers, and
8497   // 2. interleaving won't help ILP.
8498   //
8499   // The second condition is necessary because, even if the target has no
8500   // vector registers, loop vectorization may still enable scalar
8501   // interleaving.
8502   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
8503       TTI->getMaxInterleaveFactor(1) < 2)
8504     return LoopVectorizeResult(false, false);
8505 
8506   bool Changed = false, CFGChanged = false;
8507 
8508   // The vectorizer requires loops to be in simplified form.
8509   // Since simplification may add new inner loops, it has to run before the
8510   // legality and profitability checks. This means running the loop vectorizer
8511   // will simplify all loops, regardless of whether anything end up being
8512   // vectorized.
8513   for (auto &L : *LI)
8514     Changed |= CFGChanged |=
8515         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
8516 
8517   // Build up a worklist of inner-loops to vectorize. This is necessary as
8518   // the act of vectorizing or partially unrolling a loop creates new loops
8519   // and can invalidate iterators across the loops.
8520   SmallVector<Loop *, 8> Worklist;
8521 
8522   for (Loop *L : *LI)
8523     collectSupportedLoops(*L, LI, ORE, Worklist);
8524 
8525   LoopsAnalyzed += Worklist.size();
8526 
8527   // Now walk the identified inner loops.
8528   while (!Worklist.empty()) {
8529     Loop *L = Worklist.pop_back_val();
8530 
8531     // For the inner loops we actually process, form LCSSA to simplify the
8532     // transform.
8533     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
8534 
8535     Changed |= CFGChanged |= processLoop(L);
8536   }
8537 
8538   // Process each loop nest in the function.
8539   return LoopVectorizeResult(Changed, CFGChanged);
8540 }
8541 
8542 PreservedAnalyses LoopVectorizePass::run(Function &F,
8543                                          FunctionAnalysisManager &AM) {
8544     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
8545     auto &LI = AM.getResult<LoopAnalysis>(F);
8546     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
8547     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
8548     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
8549     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
8550     auto &AA = AM.getResult<AAManager>(F);
8551     auto &AC = AM.getResult<AssumptionAnalysis>(F);
8552     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
8553     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
8554     MemorySSA *MSSA = EnableMSSALoopDependency
8555                           ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
8556                           : nullptr;
8557 
8558     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
8559     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
8560         [&](Loop &L) -> const LoopAccessInfo & {
8561       LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
8562       return LAM.getResult<LoopAccessAnalysis>(L, AR);
8563     };
8564     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
8565     ProfileSummaryInfo *PSI =
8566         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
8567     LoopVectorizeResult Result =
8568         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
8569     if (!Result.MadeAnyChange)
8570       return PreservedAnalyses::all();
8571     PreservedAnalyses PA;
8572 
8573     // We currently do not preserve loopinfo/dominator analyses with outer loop
8574     // vectorization. Until this is addressed, mark these analyses as preserved
8575     // only for non-VPlan-native path.
8576     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
8577     if (!EnableVPlanNativePath) {
8578       PA.preserve<LoopAnalysis>();
8579       PA.preserve<DominatorTreeAnalysis>();
8580     }
8581     PA.preserve<BasicAA>();
8582     PA.preserve<GlobalsAA>();
8583     if (!Result.MadeCFGChange)
8584       PA.preserveSet<CFGAnalyses>();
8585     return PA;
8586 }
8587