1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
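//
// For example (purely illustrative, written as C-like pseudocode rather than
// the LLVM-IR the pass actually emits), with a vector width of 4:
//
//   for (i = 0; i < n; i++)            for (i = 0; i + 3 < n; i += 4)
//     a[i] = b[i] + c[i];        ==>      a[i:i+4] = b[i:i+4] + c[i:i+4];
//                                      // leftover iterations run in a
//                                      // scalar epilogue loop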
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is an ongoing development effort to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
180 
// The option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; the enum below lists the
// possible settings. I.e., the vectorizer will try to fold the tail loop
// (epilogue) into the vector body and predicate the instructions accordingly.
// If tail-folding fails, there are different fallback strategies depending on
// these values:
186 namespace PreferPredicateTy {
187   enum Option {
188     ScalarEpilogue = 0,
189     PredicateElseScalarEpilogue,
190     PredicateOrDontVectorize
191   };
192 } // namespace PreferPredicateTy
193 
194 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
195     "prefer-predicate-over-epilogue",
196     cl::init(PreferPredicateTy::ScalarEpilogue),
197     cl::Hidden,
198     cl::desc("Tail-folding and predication preferences over creating a scalar "
199              "epilogue loop."),
200     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
201                          "scalar-epilogue",
202                          "Don't tail-predicate loops, create scalar epilogue"),
203               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
204                          "predicate-else-scalar-epilogue",
205                          "prefer tail-folding, create scalar epilogue if tail "
206                          "folding fails."),
207               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
208                          "predicate-dont-vectorize",
                         "prefer tail-folding, don't attempt vectorization if "
210                          "tail-folding fails.")));
211 
212 static cl::opt<bool> MaximizeBandwidth(
213     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
214     cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));
216 
217 static cl::opt<bool> EnableInterleavedMemAccesses(
218     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
219     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
220 
221 /// An interleave-group may need masking if it resides in a block that needs
222 /// predication, or in order to mask away gaps.
223 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
224     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
225     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
226 
227 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
228     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
230              "below this number"));
231 
232 static cl::opt<unsigned> ForceTargetNumScalarRegs(
233     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
234     cl::desc("A flag that overrides the target's number of scalar registers."));
235 
236 static cl::opt<unsigned> ForceTargetNumVectorRegs(
237     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
238     cl::desc("A flag that overrides the target's number of vector registers."));
239 
240 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
241     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
242     cl::desc("A flag that overrides the target's max interleave factor for "
243              "scalar loops."));
244 
245 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
246     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
247     cl::desc("A flag that overrides the target's max interleave factor for "
248              "vectorized loops."));
249 
250 static cl::opt<unsigned> ForceTargetInstructionCost(
251     "force-target-instruction-cost", cl::init(0), cl::Hidden,
252     cl::desc("A flag that overrides the target's expected cost for "
253              "an instruction to a single constant value. Mostly "
254              "useful for getting consistent testing."));
255 
256 static cl::opt<unsigned> SmallLoopCost(
257     "small-loop-cost", cl::init(20), cl::Hidden,
258     cl::desc(
259         "The cost of a loop that is considered 'small' by the interleaver."));
260 
261 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
262     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
263     cl::desc("Enable the use of the block frequency analysis to access PGO "
264              "heuristics minimizing code growth in cold regions and being more "
265              "aggressive in hot regions."));
266 
267 // Runtime interleave loops for load/store throughput.
268 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
269     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
270     cl::desc(
271         "Enable runtime interleaving until load/store ports are saturated"));
272 
273 /// Interleave small loops with scalar reductions.
274 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
275     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
276     cl::desc("Enable interleaving for loops with small iteration counts that "
277              "contain scalar reductions to expose ILP."));
278 
279 /// The number of stores in a loop that are allowed to need predication.
280 static cl::opt<unsigned> NumberOfStoresToPredicate(
281     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
282     cl::desc("Max number of stores to be predicated behind an if."));
283 
284 static cl::opt<bool> EnableIndVarRegisterHeur(
285     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
286     cl::desc("Count the induction variable only once when interleaving"));
287 
288 static cl::opt<bool> EnableCondStoresVectorization(
289     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
290     cl::desc("Enable if predication of stores during vectorization."));
291 
292 static cl::opt<unsigned> MaxNestedScalarReductionIC(
293     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
294     cl::desc("The maximum interleave count to use when interleaving a scalar "
295              "reduction in a nested loop."));
296 
297 static cl::opt<bool>
298     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
299                            cl::Hidden,
300                            cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));
302 
303 static cl::opt<bool> PreferPredicatedReductionSelect(
304     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
305     cl::desc(
306         "Prefer predicating a reduction operation over an after loop select."));
307 
308 cl::opt<bool> EnableVPlanNativePath(
309     "enable-vplan-native-path", cl::init(false), cl::Hidden,
310     cl::desc("Enable VPlan-native vectorization path with "
311              "support for outer loop vectorization."));
312 
313 // FIXME: Remove this switch once we have divergence analysis. Currently we
314 // assume divergent non-backedge branches when this switch is true.
315 cl::opt<bool> EnableVPlanPredication(
316     "enable-vplan-predication", cl::init(false), cl::Hidden,
317     cl::desc("Enable VPlan-native vectorization path predicator with "
318              "support for outer loop vectorization."));
319 
320 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
322 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
323 // verification of the H-CFGs built.
324 static cl::opt<bool> VPlanBuildStressTest(
325     "vplan-build-stress-test", cl::init(false), cl::Hidden,
326     cl::desc(
327         "Build VPlan for every supported loop nest in the function and bail "
328         "out right after the build (stress test the VPlan H-CFG construction "
329         "in the VPlan-native vectorization path)."));
330 
331 cl::opt<bool> llvm::EnableLoopInterleaving(
332     "interleave-loops", cl::init(true), cl::Hidden,
333     cl::desc("Enable loop interleaving in Loop vectorization passes"));
334 cl::opt<bool> llvm::EnableLoopVectorization(
335     "vectorize-loops", cl::init(true), cl::Hidden,
336     cl::desc("Run the Loop vectorization passes"));
337 
/// A helper function that returns the type of the loaded or stored value.
339 static Type *getMemInstValueType(Value *I) {
340   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
341          "Expected Load or Store instruction");
342   if (auto *LI = dyn_cast<LoadInst>(I))
343     return LI->getType();
344   return cast<StoreInst>(I)->getValueOperand()->getType();
345 }
346 
347 /// A helper function that returns true if the given type is irregular. The
348 /// type is irregular if its allocated size doesn't equal the store size of an
349 /// element of the corresponding vector type at the given vectorization factor.
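/// For example (illustrative), a type whose allocation size is padded beyond
/// its store size, such as x86_fp80 under a data layout that stores 10 bytes
/// but allocates 16, is irregular: an array of it is not bitcast-compatible
/// with the corresponding vector type.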
350 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
351   assert(!VF.isScalable() && "scalable vectors not yet supported.");
352   // Determine if an array of VF elements of type Ty is "bitcast compatible"
353   // with a <VF x Ty> vector.
354   if (VF.isVector()) {
355     auto *VectorTy = VectorType::get(Ty, VF);
356     return TypeSize::get(VF.getKnownMinValue() *
357                              DL.getTypeAllocSize(Ty).getFixedValue(),
358                          VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
359   }
360 
361   // If the vectorization factor is one, we just check if an array of type Ty
362   // requires padding between elements.
363   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
364 }
365 
366 /// A helper function that returns the reciprocal of the block probability of
367 /// predicated blocks. If we return X, we are assuming the predicated block
368 /// will execute once for every X iterations of the loop header.
369 ///
370 /// TODO: We should use actual block probability here, if available. Currently,
371 ///       we always assume predicated blocks have a 50% chance of executing.
372 static unsigned getReciprocalPredBlockProb() { return 2; }
373 
374 /// A helper function that adds a 'fast' flag to floating-point operations.
375 static Value *addFastMathFlag(Value *V) {
376   if (isa<FPMathOperator>(V))
377     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
378   return V;
379 }
380 
381 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
382   if (isa<FPMathOperator>(V))
383     cast<Instruction>(V)->setFastMathFlags(FMF);
384   return V;
385 }
386 
387 /// A helper function that returns an integer or floating-point constant with
388 /// value C.
389 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
390   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
391                            : ConstantFP::get(Ty, C);
392 }
393 
394 /// Returns "best known" trip count for the specified loop \p L as defined by
395 /// the following procedure:
396 ///   1) Returns exact trip count if it is known.
397 ///   2) Returns expected trip count according to profile data if any.
398 ///   3) Returns upper bound estimate if it is known.
399 ///   4) Returns None if all of the above failed.
400 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
401   // Check if exact trip count is known.
402   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
403     return ExpectedTC;
404 
405   // Check if there is an expected trip count available from profile data.
406   if (LoopVectorizeWithBlockFrequency)
407     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
408       return EstimatedTC;
409 
410   // Check if upper bound estimate is known.
411   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
412     return ExpectedTC;
413 
414   return None;
415 }
416 
417 namespace llvm {
418 
419 /// InnerLoopVectorizer vectorizes loops which contain only one basic
420 /// block to a specified vectorization factor (VF).
421 /// This class performs the widening of scalars into vectors, or multiple
422 /// scalars. This class also implements the following features:
423 /// * It inserts an epilogue loop for handling loops that don't have iteration
424 ///   counts that are known to be a multiple of the vectorization factor.
425 /// * It handles the code generation for reduction variables.
426 /// * Scalarization (implementation using scalars) of un-vectorizable
427 ///   instructions.
428 /// InnerLoopVectorizer does not perform any vectorization-legality
429 /// checks, and relies on the caller to check for the different legality
430 /// aspects. The InnerLoopVectorizer relies on the
431 /// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found in the loop.
433 class InnerLoopVectorizer {
434 public:
435   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
436                       LoopInfo *LI, DominatorTree *DT,
437                       const TargetLibraryInfo *TLI,
438                       const TargetTransformInfo *TTI, AssumptionCache *AC,
439                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
440                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
441                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
442                       ProfileSummaryInfo *PSI)
443       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
444         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
445         Builder(PSE.getSE()->getContext()),
446         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
447         BFI(BFI), PSI(PSI) {
448     // Query this against the original loop and save it here because the profile
449     // of the original loop header may change as the transformation happens.
450     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
451         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
452   }
453 
454   virtual ~InnerLoopVectorizer() = default;
455 
456   /// Create a new empty loop that will contain vectorized instructions later
457   /// on, while the old loop will be used as the scalar remainder. Control flow
458   /// is generated around the vectorized (and scalar epilogue) loops consisting
459   /// of various checks and bypasses. Return the pre-header block of the new
460   /// loop.
461   BasicBlock *createVectorizedLoopSkeleton();
462 
463   /// Widen a single instruction within the innermost loop.
464   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
465                         VPTransformState &State);
466 
467   /// Widen a single call instruction within the innermost loop.
468   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
469                             VPTransformState &State);
470 
471   /// Widen a single select instruction within the innermost loop.
472   void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
473                               bool InvariantCond, VPTransformState &State);
474 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
476   void fixVectorizedLoop();
477 
478   // Return true if any runtime check is added.
479   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
480 
481   /// A type for vectorized values in the new loop. Each value from the
482   /// original loop, when vectorized, is represented by UF vector values in the
483   /// new unrolled loop, where UF is the unroll factor.
484   using VectorParts = SmallVector<Value *, 2>;
485 
486   /// Vectorize a single GetElementPtrInst based on information gathered and
487   /// decisions taken during planning.
488   void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
489                 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
490                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
491 
492   /// Vectorize a single PHINode in a block. This method handles the induction
493   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
494   /// arbitrary length vectors.
495   void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);
496 
497   /// A helper function to scalarize a single Instruction in the innermost loop.
498   /// Generates a sequence of scalar instances for each lane between \p MinLane
499   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
500   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
501   /// Instr's operands.
502   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
503                             const VPIteration &Instance, bool IfPredicateInstr,
504                             VPTransformState &State);
505 
506   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
507   /// is provided, the integer induction variable will first be truncated to
508   /// the corresponding type.
509   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
510 
511   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
512   /// vector or scalar value on-demand if one is not yet available. When
513   /// vectorizing a loop, we visit the definition of an instruction before its
514   /// uses. When visiting the definition, we either vectorize or scalarize the
515   /// instruction, creating an entry for it in the corresponding map. (In some
516   /// cases, such as induction variables, we will create both vector and scalar
517   /// entries.) Then, as we encounter uses of the definition, we derive values
518   /// for each scalar or vector use unless such a value is already available.
519   /// For example, if we scalarize a definition and one of its uses is vector,
520   /// we build the required vector on-demand with an insertelement sequence
521   /// when visiting the use. Otherwise, if the use is scalar, we can use the
522   /// existing scalar definition.
523   ///
524   /// Return a value in the new loop corresponding to \p V from the original
525   /// loop at unroll index \p Part. If the value has already been vectorized,
526   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
527   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
528   /// a new vector value on-demand by inserting the scalar values into a vector
529   /// with an insertelement sequence. If the value has been neither vectorized
530   /// nor scalarized, it must be loop invariant, so we simply broadcast the
531   /// value into a vector.
532   Value *getOrCreateVectorValue(Value *V, unsigned Part);
533 
534   void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
535     VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
536   }
537 
538   /// Return a value in the new loop corresponding to \p V from the original
539   /// loop at unroll and vector indices \p Instance. If the value has been
540   /// vectorized but not scalarized, the necessary extractelement instruction
541   /// will be generated.
542   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
543 
544   /// Construct the vector value of a scalarized value \p V one lane at a time.
545   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
546 
547   /// Try to vectorize interleaved access group \p Group with the base address
548   /// given in \p Addr, optionally masking the vector operations if \p
549   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
550   /// values in the vectorized loop.
551   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
552                                 VPTransformState &State, VPValue *Addr,
553                                 ArrayRef<VPValue *> StoredValues,
554                                 VPValue *BlockInMask = nullptr);
555 
556   /// Vectorize Load and Store instructions with the base address given in \p
557   /// Addr, optionally masking the vector operations if \p BlockInMask is
558   /// non-null. Use \p State to translate given VPValues to IR values in the
559   /// vectorized loop.
560   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
561                                   VPValue *Def, VPValue *Addr,
562                                   VPValue *StoredValue, VPValue *BlockInMask);
563 
564   /// Set the debug location in the builder using the debug location in
565   /// the instruction.
566   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
567 
568   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
  void fixNonInductionPHIs();
570 
571 protected:
572   friend class LoopVectorizationPlanner;
573 
574   /// A small list of PHINodes.
575   using PhiVector = SmallVector<PHINode *, 4>;
576 
577   /// A type for scalarized values in the new loop. Each value from the
578   /// original loop, when scalarized, is represented by UF x VF scalar values
579   /// in the new unrolled loop, where UF is the unroll factor and VF is the
580   /// vectorization factor.
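  /// For example (illustrative), with UF = 2 and VF = 4, one scalarized value
  /// from the original loop expands to 2 x 4 = 8 scalar values per iteration
  /// of the new loop.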
581   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
582 
583   /// Set up the values of the IVs correctly when exiting the vector loop.
584   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
585                     Value *CountRoundDown, Value *EndValue,
586                     BasicBlock *MiddleBlock);
587 
588   /// Create a new induction variable inside L.
589   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
590                                    Value *Step, Instruction *DL);
591 
592   /// Handle all cross-iteration phis in the header.
593   void fixCrossIterationPHIs();
594 
595   /// Fix a first-order recurrence. This is the second phase of vectorizing
596   /// this phi node.
597   void fixFirstOrderRecurrence(PHINode *Phi);
598 
599   /// Fix a reduction cross-iteration phi. This is the second phase of
600   /// vectorizing this phi node.
601   void fixReduction(PHINode *Phi);
602 
603   /// Clear NSW/NUW flags from reduction instructions if necessary.
604   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
605 
606   /// The Loop exit block may have single value PHI nodes with some
  /// incoming value. While vectorizing, we only handled real values
  /// that were defined inside the loop, and we should have one value for
609   /// each predecessor of its parent basic block. See PR14725.
610   void fixLCSSAPHIs();
611 
612   /// Iteratively sink the scalarized operands of a predicated instruction into
613   /// the block that was created for it.
614   void sinkScalarOperands(Instruction *PredInst);
615 
616   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
617   /// represented as.
618   void truncateToMinimalBitwidths();
619 
620   /// Create a broadcast instruction. This method generates a broadcast
621   /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// This is needed because each iteration in the loop corresponds to a SIMD
624   /// element.
625   virtual Value *getBroadcastInstrs(Value *V);
626 
627   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
629   /// \p Opcode is relevant for FP induction variable.
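  /// For example (illustrative), with StartIdx = 0 and Step = 2, the sequence
  /// added lane-by-lane to Val is <0, 2, 4, ...>.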
630   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
631                                Instruction::BinaryOps Opcode =
632                                Instruction::BinaryOpsEnd);
633 
634   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
635   /// variable on which to base the steps, \p Step is the size of the step, and
636   /// \p EntryVal is the value from the original loop that maps to the steps.
637   /// Note that \p EntryVal doesn't have to be an induction variable - it
638   /// can also be a truncate instruction.
639   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
640                         const InductionDescriptor &ID);
641 
642   /// Create a vector induction phi node based on an existing scalar one. \p
643   /// EntryVal is the value from the original loop that maps to the vector phi
644   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
645   /// truncate instruction, instead of widening the original IV, we widen a
646   /// version of the IV truncated to \p EntryVal's type.
647   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
648                                        Value *Step, Instruction *EntryVal);
649 
650   /// Returns true if an instruction \p I should be scalarized instead of
651   /// vectorized for the chosen vectorization factor.
652   bool shouldScalarizeInstruction(Instruction *I) const;
653 
654   /// Returns true if we should generate a scalar version of \p IV.
655   bool needsScalarInduction(Instruction *IV) const;
656 
657   /// If there is a cast involved in the induction variable \p ID, which should
658   /// be ignored in the vectorized loop body, this function records the
659   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
660   /// cast. We had already proved that the casted Phi is equal to the uncasted
661   /// Phi in the vectorized loop (under a runtime guard), and therefore
662   /// there is no need to vectorize the cast - the same value can be used in the
663   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
666   ///
667   /// \p EntryVal is the value from the original loop that maps to the vector
668   /// phi node and is used to distinguish what is the IV currently being
669   /// processed - original one (if \p EntryVal is a phi corresponding to the
670   /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
672   /// latter case \p EntryVal is a TruncInst and we must not record anything for
673   /// that IV, but it's error-prone to expect callers of this routine to care
674   /// about that, hence this explicit parameter.
675   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
676                                              const Instruction *EntryVal,
677                                              Value *VectorLoopValue,
678                                              unsigned Part,
679                                              unsigned Lane = UINT_MAX);
680 
681   /// Generate a shuffle sequence that will reverse the vector Vec.
682   virtual Value *reverseVector(Value *Vec);
683 
684   /// Returns (and creates if needed) the original loop trip count.
685   Value *getOrCreateTripCount(Loop *NewLoop);
686 
687   /// Returns (and creates if needed) the trip count of the widened loop.
688   Value *getOrCreateVectorTripCount(Loop *NewLoop);
689 
690   /// Returns a bitcasted value to the requested vector type.
691   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
692   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
693                                 const DataLayout &DL);
694 
695   /// Emit a bypass check to see if the vector trip count is zero, including if
696   /// it overflows.
697   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
698 
699   /// Emit a bypass check to see if all of the SCEV assumptions we've
700   /// had to make are correct.
701   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
702 
703   /// Emit bypass checks to check any memory assumptions we may have made.
704   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
705 
706   /// Compute the transformed value of Index at offset StartValue using step
707   /// StepValue.
708   /// For integer induction, returns StartValue + Index * StepValue.
709   /// For pointer induction, returns StartValue[Index * StepValue].
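  /// For example (illustrative), an integer induction with StartValue 100 and
  /// StepValue 4 transforms Index 3 into 100 + 3 * 4 = 112.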
710   /// FIXME: The newly created binary instructions should contain nsw/nuw
711   /// flags, which can be found from the original scalar operations.
712   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
713                               const DataLayout &DL,
714                               const InductionDescriptor &ID) const;
715 
716   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
717   /// vector loop preheader, middle block and scalar preheader. Also
718   /// allocate a loop object for the new vector loop and return it.
719   Loop *createVectorLoopSkeleton(StringRef Prefix);
720 
721   /// Create new phi nodes for the induction variables to resume iteration count
722   /// in the scalar epilogue, from where the vectorized loop left off (given by
723   /// \p VectorTripCount).
724   void createInductionResumeValues(Loop *L, Value *VectorTripCount);
725 
726   /// Complete the loop skeleton by adding debug MDs, creating appropriate
727   /// conditional branches in the middle block, preparing the builder and
728   /// running the verifier. Take in the vector loop \p L as argument, and return
729   /// the preheader of the completed vector loop.
730   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
731 
732   /// Add additional metadata to \p To that was not present on \p Orig.
733   ///
734   /// Currently this is used to add the noalias annotations based on the
735   /// inserted memchecks.  Use this for instructions that are *cloned* into the
736   /// vector loop.
737   void addNewMetadata(Instruction *To, const Instruction *Orig);
738 
739   /// Add metadata from one instruction to another.
740   ///
741   /// This includes both the original MDs from \p From and additional ones (\see
742   /// addNewMetadata).  Use this for *newly created* instructions in the vector
743   /// loop.
744   void addMetadata(Instruction *To, Instruction *From);
745 
746   /// Similar to the previous function but it adds the metadata to a
747   /// vector of instructions.
748   void addMetadata(ArrayRef<Value *> To, Instruction *From);
749 
750   /// The original loop.
751   Loop *OrigLoop;
752 
753   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
754   /// dynamic knowledge to simplify SCEV expressions and converts them to a
755   /// more usable form.
756   PredicatedScalarEvolution &PSE;
757 
758   /// Loop Info.
759   LoopInfo *LI;
760 
761   /// Dominator Tree.
762   DominatorTree *DT;
763 
764   /// Alias Analysis.
765   AAResults *AA;
766 
767   /// Target Library Info.
768   const TargetLibraryInfo *TLI;
769 
770   /// Target Transform Info.
771   const TargetTransformInfo *TTI;
772 
773   /// Assumption Cache.
774   AssumptionCache *AC;
775 
776   /// Interface to emit optimization remarks.
777   OptimizationRemarkEmitter *ORE;
778 
779   /// LoopVersioning.  It's only set up (non-null) if memchecks were
780   /// used.
781   ///
782   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
784   std::unique_ptr<LoopVersioning> LVer;
785 
786   /// The vectorization SIMD factor to use. Each vector will have this many
787   /// vector elements.
788   ElementCount VF;
789 
790   /// The vectorization unroll factor to use. Each scalar is vectorized to this
791   /// many different vector instructions.
792   unsigned UF;
793 
794   /// The builder that we use
795   IRBuilder<> Builder;
796 
797   // --- Vectorization state ---
798 
799   /// The vector-loop preheader.
800   BasicBlock *LoopVectorPreHeader;
801 
802   /// The scalar-loop preheader.
803   BasicBlock *LoopScalarPreHeader;
804 
805   /// Middle Block between the vector and the scalar.
806   BasicBlock *LoopMiddleBlock;
807 
808   /// The ExitBlock of the scalar loop.
809   BasicBlock *LoopExitBlock;
810 
811   /// The vector loop body.
812   BasicBlock *LoopVectorBody;
813 
814   /// The scalar loop body.
815   BasicBlock *LoopScalarBody;
816 
817   /// A list of all bypass blocks. The first block is the entry of the loop.
818   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
819 
820   /// The new Induction variable which was added to the new block.
821   PHINode *Induction = nullptr;
822 
823   /// The induction variable of the old basic block.
824   PHINode *OldInduction = nullptr;
825 
826   /// Maps values from the original loop to their corresponding values in the
827   /// vectorized loop. A key value can map to either vector values, scalar
828   /// values or both kinds of values, depending on whether the key was
829   /// vectorized and scalarized.
830   VectorizerValueMap VectorLoopValueMap;
831 
832   /// Store instructions that were predicated.
833   SmallVector<Instruction *, 4> PredicatedInstructions;
834 
835   /// Trip count of the original loop.
836   Value *TripCount = nullptr;
837 
838   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
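  /// For example (illustrative), with TripCount = 10, VF = 4 and UF = 2 this
  /// is 10 - 10 % 8 = 8: the vector loop covers 8 of the original iterations
  /// and the remaining 2 are left to the scalar epilogue.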
839   Value *VectorTripCount = nullptr;
840 
841   /// The legality analysis.
842   LoopVectorizationLegality *Legal;
843 
  /// The profitability analysis.
845   LoopVectorizationCostModel *Cost;
846 
847   // Record whether runtime checks are added.
848   bool AddedSafetyChecks = false;
849 
850   // Holds the end values for each induction variable. We save the end values
851   // so we can later fix-up the external users of the induction variables.
852   DenseMap<PHINode *, Value *> IVEndValues;
853 
854   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
855   // fixed up at the end of vector code generation.
856   SmallVector<PHINode *, 8> OrigPHIsToFix;
857 
858   /// BFI and PSI are used to check for profile guided size optimizations.
859   BlockFrequencyInfo *BFI;
860   ProfileSummaryInfo *PSI;
861 
  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
864   bool OptForSizeBasedOnProfile;
865 };
866 
867 class InnerLoopUnroller : public InnerLoopVectorizer {
868 public:
869   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
870                     LoopInfo *LI, DominatorTree *DT,
871                     const TargetLibraryInfo *TLI,
872                     const TargetTransformInfo *TTI, AssumptionCache *AC,
873                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
874                     LoopVectorizationLegality *LVL,
875                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
876                     ProfileSummaryInfo *PSI)
877       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
878                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
879                             BFI, PSI) {}
880 
881 private:
882   Value *getBroadcastInstrs(Value *V) override;
883   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
884                        Instruction::BinaryOps Opcode =
885                        Instruction::BinaryOpsEnd) override;
886   Value *reverseVector(Value *Vec) override;
887 };
888 
889 } // end namespace llvm
890 
/// Look for a meaningful debug location on the instruction or its
892 /// operands.
893 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
894   if (!I)
895     return I;
896 
897   DebugLoc Empty;
898   if (I->getDebugLoc() != Empty)
899     return I;
900 
  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }
906 
907   return I;
908 }
909 
910 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
911   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
912     const DILocation *DIL = Inst->getDebugLoc();
913     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
914         !isa<DbgInfoIntrinsic>(Inst)) {
915       assert(!VF.isScalable() && "scalable vectors not yet supported.");
916       auto NewDIL =
917           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
918       if (NewDIL)
919         B.SetCurrentDebugLocation(NewDIL.getValue());
920       else
921         LLVM_DEBUG(dbgs()
922                    << "Failed to create new discriminator: "
923                    << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
927   } else
928     B.SetCurrentDebugLocation(DebugLoc());
929 }
930 
931 /// Write a record \p DebugMsg about vectorization failure to the debug
932 /// output stream. If \p I is passed, it is an instruction that prevents
933 /// vectorization.
934 #ifndef NDEBUG
935 static void debugVectorizationFailure(const StringRef DebugMsg,
936     Instruction *I) {
937   dbgs() << "LV: Not vectorizing: " << DebugMsg;
938   if (I != nullptr)
939     dbgs() << " " << *I;
940   else
941     dbgs() << '.';
942   dbgs() << '\n';
943 }
944 #endif
945 
946 /// Create an analysis remark that explains why vectorization failed
947 ///
948 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
949 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
950 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
951 /// the location of the remark.  \return the remark object that can be
952 /// streamed to.
953 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
954     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
955   Value *CodeRegion = TheLoop->getHeader();
956   DebugLoc DL = TheLoop->getStartLoc();
957 
958   if (I) {
959     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
961     // using the loop's.
962     if (I->getDebugLoc())
963       DL = I->getDebugLoc();
964   }
965 
966   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
967   R << "loop not vectorized: ";
968   return R;
969 }
970 
971 namespace llvm {
972 
973 void reportVectorizationFailure(const StringRef DebugMsg,
974     const StringRef OREMsg, const StringRef ORETag,
975     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
976   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
977   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
978   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
979                 ORETag, TheLoop, I) << OREMsg);
980 }
981 
982 } // end namespace llvm
983 
984 #ifndef NDEBUG
985 /// \return string containing a file name and a line # for the given loop.
986 static std::string getDebugLocString(const Loop *L) {
987   std::string Result;
988   if (L) {
989     raw_string_ostream OS(Result);
990     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
991       LoopDbgLoc.print(OS);
992     else
993       // Just print the module name.
994       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
995     OS.flush();
996   }
997   return Result;
998 }
999 #endif
1000 
1001 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1002                                          const Instruction *Orig) {
1003   // If the loop was versioned with memchecks, add the corresponding no-alias
1004   // metadata.
1005   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1006     LVer->annotateInstWithNoAlias(To, Orig);
1007 }
1008 
1009 void InnerLoopVectorizer::addMetadata(Instruction *To,
1010                                       Instruction *From) {
1011   propagateMetadata(To, From);
1012   addNewMetadata(To, From);
1013 }
1014 
1015 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1016                                       Instruction *From) {
1017   for (Value *V : To) {
1018     if (Instruction *I = dyn_cast<Instruction>(V))
1019       addMetadata(I, From);
1020   }
1021 }
1022 
1023 namespace llvm {
1024 
1025 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1026 // lowered.
1027 enum ScalarEpilogueLowering {
1028 
1029   // The default: allowing scalar epilogues.
1030   CM_ScalarEpilogueAllowed,
1031 
1032   // Vectorization with OptForSize: don't allow epilogues.
1033   CM_ScalarEpilogueNotAllowedOptSize,
1034 
  // A special case of vectorization with OptForSize: loops with a very small
1036   // trip count are considered for vectorization under OptForSize, thereby
1037   // making sure the cost of their loop body is dominant, free of runtime
1038   // guards and scalar iteration overheads.
1039   CM_ScalarEpilogueNotAllowedLowTripLoop,
1040 
1041   // Loop hint predicate indicating an epilogue is undesired.
1042   CM_ScalarEpilogueNotNeededUsePredicate
1043 };
1044 
1045 /// LoopVectorizationCostModel - estimates the expected speedups due to
1046 /// vectorization.
1047 /// In many cases vectorization is not profitable. This can happen because of
1048 /// a number of reasons. In this class we mainly attempt to predict the
1049 /// expected speedup/slowdowns due to the supported instruction set. We use the
1050 /// TargetTransformInfo to query the different backends for the cost of
1051 /// different operations.
1052 class LoopVectorizationCostModel {
1053 public:
1054   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1055                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1056                              LoopVectorizationLegality *Legal,
1057                              const TargetTransformInfo &TTI,
1058                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1059                              AssumptionCache *AC,
1060                              OptimizationRemarkEmitter *ORE, const Function *F,
1061                              const LoopVectorizeHints *Hints,
1062                              InterleavedAccessInfo &IAI)
1063       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1064         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1065         Hints(Hints), InterleaveInfo(IAI) {}
1066 
1067   /// \return An upper bound for the vectorization factor, or None if
1068   /// vectorization and interleaving should be avoided up front.
1069   Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1070 
1071   /// \return True if runtime checks are required for vectorization, and false
1072   /// otherwise.
1073   bool runtimeChecksRequired();
1074 
1075   /// \return The most profitable vectorization factor and the cost of that VF.
1076   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1077   /// then this vectorization factor will be selected if vectorization is
1078   /// possible.
1079   VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1080 
1081   /// Setup cost-based decisions for user vectorization factor.
1082   void selectUserVectorizationFactor(ElementCount UserVF) {
1083     collectUniformsAndScalars(UserVF);
1084     collectInstsToScalarize(UserVF);
1085   }
1086 
1087   /// \return The size (in bits) of the smallest and widest types in the code
1088   /// that needs to be vectorized. We ignore values that remain scalar such as
1089   /// 64 bit loop indices.
1090   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1091 
1092   /// \return The desired interleave count.
1093   /// If interleave count has been specified by metadata it will be returned.
1094   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1095   /// are the selected vectorization factor and the cost of the selected VF.
1096   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1097 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
1102   /// the lists of loop-uniform and loop-scalar instructions.
1103   /// The calculated cost is saved with widening decision in order to
1104   /// avoid redundant calculations.
1105   void setCostBasedWideningDecision(ElementCount VF);
1106 
1107   /// A struct that represents some properties of the register usage
1108   /// of a loop.
1109   struct RegisterUsage {
1110     /// Holds the number of loop invariant values that are used in the loop.
1111     /// The key is ClassID of target-provided register class.
1112     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1113     /// Holds the maximum number of concurrent live intervals in the loop.
1114     /// The key is ClassID of target-provided register class.
1115     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1116   };
1117 
1118   /// \return Returns information about the register usages of the loop for the
1119   /// given vectorization factors.
1120   SmallVector<RegisterUsage, 8>
1121   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1122 
1123   /// Collect values we want to ignore in the cost model.
1124   void collectValuesToIgnore();
1125 
1126   /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1128   void collectInLoopReductions();
1129 
1130   /// \returns The smallest bitwidth each instruction can be represented with.
1131   /// The vector equivalents of these instructions should be truncated to this
1132   /// type.
1133   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1134     return MinBWs;
1135   }
1136 
1137   /// \returns True if it is more profitable to scalarize instruction \p I for
1138   /// vectorization factor \p VF.
1139   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1140     assert(VF.isVector() &&
1141            "Profitable to scalarize relevant only for VF > 1.");
1142 
1143     // Cost model is not run in the VPlan-native path - return conservative
1144     // result until this changes.
1145     if (EnableVPlanNativePath)
1146       return false;
1147 
1148     auto Scalars = InstsToScalarize.find(VF);
1149     assert(Scalars != InstsToScalarize.end() &&
1150            "VF not yet analyzed for scalarization profitability");
1151     return Scalars->second.find(I) != Scalars->second.end();
1152   }
1153 
1154   /// Returns true if \p I is known to be uniform after vectorization.
1155   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1156     if (VF.isScalar())
1157       return true;
1158 
1159     // Cost model is not run in the VPlan-native path - return conservative
1160     // result until this changes.
1161     if (EnableVPlanNativePath)
1162       return false;
1163 
1164     auto UniformsPerVF = Uniforms.find(VF);
1165     assert(UniformsPerVF != Uniforms.end() &&
1166            "VF not yet analyzed for uniformity");
1167     return UniformsPerVF->second.count(I);
1168   }
1169 
1170   /// Returns true if \p I is known to be scalar after vectorization.
1171   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1172     if (VF.isScalar())
1173       return true;
1174 
1175     // Cost model is not run in the VPlan-native path - return conservative
1176     // result until this changes.
1177     if (EnableVPlanNativePath)
1178       return false;
1179 
1180     auto ScalarsPerVF = Scalars.find(VF);
1181     assert(ScalarsPerVF != Scalars.end() &&
1182            "Scalar values are not calculated for VF");
1183     return ScalarsPerVF->second.count(I);
1184   }
1185 
1186   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1187   /// for vectorization factor \p VF.
1188   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1189     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1190            !isProfitableToScalarize(I, VF) &&
1191            !isScalarAfterVectorization(I, VF);
1192   }
1193 
  /// Decision taken during cost calculation for a memory instruction.
1195   enum InstWidening {
1196     CM_Unknown,
1197     CM_Widen,         // For consecutive accesses with stride +1.
1198     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1199     CM_Interleave,
1200     CM_GatherScatter,
1201     CM_Scalarize
1202   };
1203 
1204   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1205   /// instruction \p I and vector width \p VF.
1206   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1207                            unsigned Cost) {
1208     assert(VF.isVector() && "Expected VF >=2");
1209     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1210   }
1211 
1212   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1213   /// interleaving group \p Grp and vector width \p VF.
1214   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1215                            ElementCount VF, InstWidening W, unsigned Cost) {
1216     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1219     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1220       if (auto *I = Grp->getMember(i)) {
1221         if (Grp->getInsertPos() == I)
1222           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1223         else
1224           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1225       }
1226     }
1227   }
1228 
1229   /// Return the cost model decision for the given instruction \p I and vector
1230   /// width \p VF. Return CM_Unknown if this instruction did not pass
1231   /// through the cost modeling.
1232   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1233     assert(!VF.isScalable() && "scalable vectors not yet supported.");
1234     assert(VF.isVector() && "Expected VF >=2");
1235 
1236     // Cost model is not run in the VPlan-native path - return conservative
1237     // result until this changes.
1238     if (EnableVPlanNativePath)
1239       return CM_GatherScatter;
1240 
1241     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1242     auto Itr = WideningDecisions.find(InstOnVF);
1243     if (Itr == WideningDecisions.end())
1244       return CM_Unknown;
1245     return Itr->second.first;
1246   }
1247 
1248   /// Return the vectorization cost for the given instruction \p I and vector
1249   /// width \p VF.
1250   unsigned getWideningCost(Instruction *I, ElementCount VF) {
1251     assert(VF.isVector() && "Expected VF >=2");
1252     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1253     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1254            "The cost is not calculated");
1255     return WideningDecisions[InstOnVF].second;
1256   }
1257 
1258   /// Return True if instruction \p I is an optimizable truncate whose operand
1259   /// is an induction variable. Such a truncate will be removed by adding a new
1260   /// induction variable with the destination type.
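  /// For example (an illustrative sketch): for an i64 induction %iv, a use
  /// such as "trunc i64 %iv to i32" can be removed by creating a separate i32
  /// induction variable that starts at the truncated start value and steps by
  /// the truncated step.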
1261   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1262     // If the instruction is not a truncate, return false.
1263     auto *Trunc = dyn_cast<TruncInst>(I);
1264     if (!Trunc)
1265       return false;
1266 
1267     // Get the source and destination types of the truncate.
1268     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1269     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1270 
1271     // If the truncate is free for the given types, return false. Replacing a
1272     // free truncate with an induction variable would add an induction variable
1273     // update instruction to each iteration of the loop. We exclude from this
1274     // check the primary induction variable since it will need an update
1275     // instruction regardless.
1276     Value *Op = Trunc->getOperand(0);
1277     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1278       return false;
1279 
1280     // If the truncated value is not an induction variable, return false.
1281     return Legal->isInductionPhi(Op);
1282   }
1283 
1284   /// Collects the instructions to scalarize for each predicated instruction in
1285   /// the loop.
1286   void collectInstsToScalarize(ElementCount VF);
1287 
1288   /// Collect Uniform and Scalar values for the given \p VF.
1289   /// The sets depend on CM decision for Load/Store instructions
1290   /// that may be vectorized as interleave, gather-scatter or scalarized.
1291   void collectUniformsAndScalars(ElementCount VF) {
1292     // Do the analysis once.
1293     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1294       return;
1295     setCostBasedWideningDecision(VF);
1296     collectLoopUniforms(VF);
1297     collectLoopScalars(VF);
1298   }
1299 
1300   /// Returns true if the target machine supports masked store operation
1301   /// for the given \p DataType and kind of access to \p Ptr.
1302   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1303     return Legal->isConsecutivePtr(Ptr) &&
1304            TTI.isLegalMaskedStore(DataType, Alignment);
1305   }
1306 
1307   /// Returns true if the target machine supports masked load operation
1308   /// for the given \p DataType and kind of access to \p Ptr.
1309   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1310     return Legal->isConsecutivePtr(Ptr) &&
1311            TTI.isLegalMaskedLoad(DataType, Alignment);
1312   }
1313 
1314   /// Returns true if the target machine supports masked scatter operation
1315   /// for the given \p DataType.
1316   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1317     return TTI.isLegalMaskedScatter(DataType, Alignment);
1318   }
1319 
1320   /// Returns true if the target machine supports masked gather operation
1321   /// for the given \p DataType.
1322   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1323     return TTI.isLegalMaskedGather(DataType, Alignment);
1324   }
1325 
1326   /// Returns true if the target machine can represent \p V as a masked gather
1327   /// or scatter operation.
1328   bool isLegalGatherOrScatter(Value *V) {
1329     bool LI = isa<LoadInst>(V);
1330     bool SI = isa<StoreInst>(V);
1331     if (!LI && !SI)
1332       return false;
1333     auto *Ty = getMemInstValueType(V);
1334     Align Align = getLoadStoreAlignment(V);
1335     return (LI && isLegalMaskedGather(Ty, Align)) ||
1336            (SI && isLegalMaskedScatter(Ty, Align));
1337   }
1338 
1339   /// Returns true if \p I is an instruction that will be scalarized with
1340   /// predication. Such instructions include conditional stores and
1341   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
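  /// For example (illustrative), a udiv that executes under a condition in the
  /// source loop must be scalarized and predicated so that it only runs for
  /// lanes whose condition is true, since speculating it could divide by zero.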
1344   bool isScalarWithPredication(Instruction *I,
1345                                ElementCount VF = ElementCount::getFixed(1));
1346 
1347   // Returns true if \p I is an instruction that will be predicated either
1348   // through scalar predication or masked load/store or masked gather/scatter.
1349   // Superset of instructions that return true for isScalarWithPredication.
1350   bool isPredicatedInst(Instruction *I) {
1351     if (!blockNeedsPredication(I->getParent()))
1352       return false;
1353     // Loads and stores that need some form of masked operation are predicated
1354     // instructions.
1355     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1356       return Legal->isMaskRequired(I);
1357     return isScalarWithPredication(I);
1358   }
1359 
1360   /// Returns true if \p I is a memory instruction with consecutive memory
1361   /// access that can be widened.
1362   bool
1363   memoryInstructionCanBeWidened(Instruction *I,
1364                                 ElementCount VF = ElementCount::getFixed(1));
1365 
1366   /// Returns true if \p I is a memory instruction in an interleaved-group
1367   /// of memory accesses that can be vectorized with wide vector loads/stores
1368   /// and shuffles.
1369   bool
1370   interleavedAccessCanBeWidened(Instruction *I,
1371                                 ElementCount VF = ElementCount::getFixed(1));
1372 
1373   /// Check if \p Instr belongs to any interleaved access group.
1374   bool isAccessInterleaved(Instruction *Instr) {
1375     return InterleaveInfo.isInterleaved(Instr);
1376   }
1377 
1378   /// Get the interleaved access group that \p Instr belongs to.
1379   const InterleaveGroup<Instruction> *
1380   getInterleavedAccessGroup(Instruction *Instr) {
1381     return InterleaveInfo.getInterleaveGroup(Instr);
1382   }
1383 
1384   /// Returns true if an interleaved group requires a scalar iteration
1385   /// to handle accesses with gaps, and there is nothing preventing us from
1386   /// creating a scalar epilogue.
1387   bool requiresScalarEpilogue() const {
1388     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1389   }
1390 
1391   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1392   /// loop hint annotation.
1393   bool isScalarEpilogueAllowed() const {
1394     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1395   }
1396 
  /// Returns true if all loop blocks should be masked in order to fold the
  /// tail of the loop.
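  /// For example (illustrative), with a trip count of 10 and VF = 4, folding
  /// the tail masks off the last two lanes of the third vector iteration
  /// instead of running a scalar remainder loop.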
1398   bool foldTailByMasking() const { return FoldTailByMasking; }
1399 
1400   bool blockNeedsPredication(BasicBlock *BB) {
1401     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1402   }
1403 
1404   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1405   /// nodes to the chain of instructions representing the reductions. Uses a
1406   /// MapVector to ensure deterministic iteration order.
1407   using ReductionChainMap =
1408       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1409 
  /// Return the chains of instructions representing the in-loop reductions.
1411   const ReductionChainMap &getInLoopReductionChains() const {
1412     return InLoopReductionChains;
1413   }
1414 
  /// Returns true if the Phi is part of an in-loop reduction.
1416   bool isInLoopReduction(PHINode *Phi) const {
1417     return InLoopReductionChains.count(Phi);
1418   }
1419 
1420   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1421   /// with factor VF.  Return the cost of the instruction, including
1422   /// scalarization overhead if it's needed.
1423   unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1424 
1425   /// Estimate cost of a call instruction CI if it were vectorized with factor
1426   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1430   unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1431                              bool &NeedToScalarize);
1432 
1433   /// Invalidates decisions already taken by the cost model.
1434   void invalidateCostModelingDecisions() {
1435     WideningDecisions.clear();
1436     Uniforms.clear();
1437     Scalars.clear();
1438   }
1439 
1440 private:
1441   unsigned NumPredStores = 0;
1442 
1443   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1444   /// than zero. One is returned if vectorization should best be avoided due
1445   /// to cost.
1446   ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1447                                     ElementCount UserVF);
1448 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1456   using VectorizationCostTy = std::pair<unsigned, bool>;
1457 
1458   /// Returns the expected execution cost. The unit of the cost does
1459   /// not matter because we use the 'cost' units to compare different
1460   /// vector widths. The cost that is returned is *not* normalized by
1461   /// the factor width.
1462   VectorizationCostTy expectedCost(ElementCount VF);
1463 
1464   /// Returns the execution time cost of an instruction for a given vector
1465   /// width. Vector width of one means scalar.
1466   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1467 
1468   /// The cost-computation logic from getInstructionCost which provides
1469   /// the vector type as an output parameter.
1470   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1471 
1472   /// Calculate vectorization cost of memory instruction \p I.
1473   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1474 
  /// The cost computation for a scalarized memory instruction.
  unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for an interleaving group of memory instructions.
  unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for a Gather/Scatter instruction.
  unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1483 
1484   /// The cost computation for widening instruction \p I with consecutive
1485   /// memory access.
1486   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1487 
1488   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1489   /// Load: scalar load + broadcast.
1490   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1491   /// element)
1492   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1493 
1494   /// Estimate the overhead of scalarizing an instruction. This is a
1495   /// convenience wrapper for the type-based getScalarizationOverhead API.
1496   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1497 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1500   bool isConsecutiveLoadOrStore(Instruction *I);
1501 
1502   /// Returns true if an artificially high cost for emulated masked memrefs
1503   /// should be used.
1504   bool useEmulatedMaskMemRefHack(Instruction *I);
1505 
1506   /// Map of scalar integer values to the smallest bitwidth they can be legally
1507   /// represented as. The vector equivalents of these values should be truncated
1508   /// to this type.
1509   MapVector<Instruction *, uint64_t> MinBWs;
1510 
1511   /// A type representing the costs for instructions if they were to be
1512   /// scalarized rather than vectorized. The entries are Instruction-Cost
1513   /// pairs.
1514   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1515 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1518   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1519 
1520   /// Records whether it is allowed to have the original scalar loop execute at
1521   /// least once. This may be needed as a fallback loop in case runtime
1522   /// aliasing/dependence checks fail, or to handle the tail/remainder
1523   /// iterations when the trip count is unknown or doesn't divide by the VF,
1524   /// or as a peel-loop to handle gaps in interleave-groups.
1525   /// Under optsize and when the trip count is very small we don't allow any
1526   /// iterations to execute in the scalar loop.
1527   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1528 
  /// All blocks of the loop are to be masked in order to fold the tail of the
  /// scalar iterations.
1530   bool FoldTailByMasking = false;
1531 
1532   /// A map holding scalar costs for different vectorization factors. The
1533   /// presence of a cost for an instruction in the mapping indicates that the
1534   /// instruction will be scalarized when vectorizing with the associated
1535   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1536   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1537 
1538   /// Holds the instructions known to be uniform after vectorization.
1539   /// The data is collected per VF.
1540   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1541 
1542   /// Holds the instructions known to be scalar after vectorization.
1543   /// The data is collected per VF.
1544   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1545 
1546   /// Holds the instructions (address computations) that are forced to be
1547   /// scalarized.
1548   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1549 
  /// PHINodes of the reductions that should be expanded in-loop, along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
1553   ReductionChainMap InLoopReductionChains;
1554 
1555   /// Returns the expected difference in cost from scalarizing the expression
1556   /// feeding a predicated instruction \p PredInst. The instructions to
1557   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1558   /// non-negative return value implies the expression will be scalarized.
1559   /// Currently, only single-use chains are considered for scalarization.
1560   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1561                               ElementCount VF);
1562 
1563   /// Collect the instructions that are uniform after vectorization. An
1564   /// instruction is uniform if we represent it with a single scalar value in
1565   /// the vectorized loop corresponding to each vector iteration. Examples of
1566   /// uniform instructions include pointer operands of consecutive or
1567   /// interleaved memory accesses. Note that although uniformity implies an
1568   /// instruction will be scalar, the reverse is not true. In general, a
1569   /// scalarized instruction will be represented by VF scalar values in the
1570   /// vectorized loop, each corresponding to an iteration of the original
1571   /// scalar loop.
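  /// For example (illustrative), the address feeding a consecutive load A[i]
  /// is uniform: a single wide load covers all VF lanes, so only the lane-zero
  /// address needs to be generated per vector iteration.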
1572   void collectLoopUniforms(ElementCount VF);
1573 
1574   /// Collect the instructions that are scalar after vectorization. An
1575   /// instruction is scalar if it is known to be uniform or will be scalarized
1576   /// during vectorization. Non-uniform scalarized instructions will be
1577   /// represented by VF values in the vectorized loop, each corresponding to an
1578   /// iteration of the original scalar loop.
1579   void collectLoopScalars(ElementCount VF);
1580 
1581   /// Keeps cost model vectorization decision and cost for instructions.
1582   /// Right now it is used for memory instructions only.
1583   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1584                                 std::pair<InstWidening, unsigned>>;
1585 
1586   DecisionList WideningDecisions;
1587 
1588   /// Returns true if \p V is expected to be vectorized and it needs to be
1589   /// extracted.
1590   bool needsExtract(Value *V, ElementCount VF) const {
1591     Instruction *I = dyn_cast<Instruction>(V);
1592     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1593         TheLoop->isLoopInvariant(I))
1594       return false;
1595 
1596     // Assume we can vectorize V (and hence we need extraction) if the
1597     // scalars are not computed yet. This can happen, because it is called
1598     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1599     // the scalars are collected. That should be a safe assumption in most
1600     // cases, because we check if the operands have vectorizable types
1601     // beforehand in LoopVectorizationLegality.
1602     return Scalars.find(VF) == Scalars.end() ||
1603            !isScalarAfterVectorization(I, VF);
1604   };
1605 
1606   /// Returns a range containing only operands needing to be extracted.
1607   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1608                                                    ElementCount VF) {
1609     return SmallVector<Value *, 4>(make_filter_range(
1610         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1611   }
1612 
1613 public:
1614   /// The loop that we evaluate.
1615   Loop *TheLoop;
1616 
1617   /// Predicated scalar evolution analysis.
1618   PredicatedScalarEvolution &PSE;
1619 
1620   /// Loop Info analysis.
1621   LoopInfo *LI;
1622 
1623   /// Vectorization legality.
1624   LoopVectorizationLegality *Legal;
1625 
1626   /// Vector target information.
1627   const TargetTransformInfo &TTI;
1628 
1629   /// Target Library Info.
1630   const TargetLibraryInfo *TLI;
1631 
1632   /// Demanded bits analysis.
1633   DemandedBits *DB;
1634 
1635   /// Assumption cache.
1636   AssumptionCache *AC;
1637 
1638   /// Interface to emit optimization remarks.
1639   OptimizationRemarkEmitter *ORE;
1640 
1641   const Function *TheFunction;
1642 
1643   /// Loop Vectorize Hint.
1644   const LoopVectorizeHints *Hints;
1645 
  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride that are close to each other.
1648   InterleavedAccessInfo &InterleaveInfo;
1649 
1650   /// Values to ignore in the cost model.
1651   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1652 
1653   /// Values to ignore in the cost model when VF > 1.
1654   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1655 };
1656 
1657 } // end namespace llvm
1658 
1659 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the semantics of the
// 'clang loop vectorize' pragma: it provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal), whereas pragma 'omp simd'
1668 // provides *explicit vectorization hints* (LV can bypass legal checks and
1669 // assume that vectorization is legal). However, both hints are implemented
1670 // using the same metadata (llvm.loop.vectorize, processed by
1671 // LoopVectorizeHints). This will be fixed in the future when the native IR
1672 // representation for pragma 'omp simd' is introduced.
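// For example (an illustrative sketch of the expected annotation), an outer
// loop would be marked as:
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)   // outer loop considered here
//     for (int j = 0; j < M; ++j)
//       ...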
1673 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1674                                    OptimizationRemarkEmitter *ORE) {
1675   assert(!OuterLp->isInnermost() && "This is not an outer loop");
1676   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1677 
1678   // Only outer loops with an explicit vectorization hint are supported.
1679   // Unannotated outer loops are ignored.
1680   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1681     return false;
1682 
1683   Function *Fn = OuterLp->getHeader()->getParent();
1684   if (!Hints.allowVectorization(Fn, OuterLp,
1685                                 true /*VectorizeOnlyWhenForced*/)) {
1686     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1687     return false;
1688   }
1689 
1690   if (Hints.getInterleave() > 1) {
1691     // TODO: Interleave support is future work.
1692     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1693                          "outer loops.\n");
1694     Hints.emitRemarkWithHints();
1695     return false;
1696   }
1697 
1698   return true;
1699 }
1700 
1701 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1702                                   OptimizationRemarkEmitter *ORE,
1703                                   SmallVectorImpl<Loop *> &V) {
1704   // Collect inner loops and outer loops without irreducible control flow. For
1705   // now, only collect outer loops that have explicit vectorization hints. If we
1706   // are stress testing the VPlan H-CFG construction, we collect the outermost
1707   // loop of every loop nest.
1708   if (L.isInnermost() || VPlanBuildStressTest ||
1709       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1710     LoopBlocksRPO RPOT(&L);
1711     RPOT.perform(LI);
1712     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1713       V.push_back(&L);
1714       // TODO: Collect inner loops inside marked outer loops in case
1715       // vectorization fails for the outer loop. Do not invoke
1716       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1717       // already known to be reducible. We can use an inherited attribute for
1718       // that.
1719       return;
1720     }
1721   }
1722   for (Loop *InnerL : L)
1723     collectSupportedLoops(*InnerL, LI, ORE, V);
1724 }
1725 
1726 namespace {
1727 
1728 /// The LoopVectorize Pass.
1729 struct LoopVectorize : public FunctionPass {
1730   /// Pass identification, replacement for typeid
1731   static char ID;
1732 
1733   LoopVectorizePass Impl;
1734 
1735   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1736                          bool VectorizeOnlyWhenForced = false)
1737       : FunctionPass(ID),
1738         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1739     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1740   }
1741 
1742   bool runOnFunction(Function &F) override {
1743     if (skipFunction(F))
1744       return false;
1745 
1746     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1747     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1748     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1749     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1750     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1751     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1752     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1753     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1754     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1755     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1756     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1757     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1758     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1759 
1760     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1761         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1762 
1763     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1764                         GetLAA, *ORE, PSI).MadeAnyChange;
1765   }
1766 
1767   void getAnalysisUsage(AnalysisUsage &AU) const override {
1768     AU.addRequired<AssumptionCacheTracker>();
1769     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1770     AU.addRequired<DominatorTreeWrapperPass>();
1771     AU.addRequired<LoopInfoWrapperPass>();
1772     AU.addRequired<ScalarEvolutionWrapperPass>();
1773     AU.addRequired<TargetTransformInfoWrapperPass>();
1774     AU.addRequired<AAResultsWrapperPass>();
1775     AU.addRequired<LoopAccessLegacyAnalysis>();
1776     AU.addRequired<DemandedBitsWrapperPass>();
1777     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1778     AU.addRequired<InjectTLIMappingsLegacy>();
1779 
1780     // We currently do not preserve loopinfo/dominator analyses with outer loop
1781     // vectorization. Until this is addressed, mark these analyses as preserved
1782     // only for non-VPlan-native path.
1783     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1784     if (!EnableVPlanNativePath) {
1785       AU.addPreserved<LoopInfoWrapperPass>();
1786       AU.addPreserved<DominatorTreeWrapperPass>();
1787     }
1788 
1789     AU.addPreserved<BasicAAWrapperPass>();
1790     AU.addPreserved<GlobalsAAWrapperPass>();
1791     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1792   }
1793 };
1794 
1795 } // end anonymous namespace
1796 
1797 //===----------------------------------------------------------------------===//
1798 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1799 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1800 //===----------------------------------------------------------------------===//
1801 
1802 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1806   Instruction *Instr = dyn_cast<Instruction>(V);
1807   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1808                      (!Instr ||
1809                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1810   // Place the code for broadcasting invariant variables in the new preheader.
1811   IRBuilder<>::InsertPointGuard Guard(Builder);
1812   if (SafeToHoist)
1813     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1814 
1815   // Broadcast the scalar into all locations in the vector.
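  // Conceptually (illustrative IR for a fixed VF of 4 and an i32 value), the
  // splat is an insertelement into lane zero followed by a zero-mask
  // shufflevector:
  //   %ins   = insertelement <4 x i32> undef, i32 %v, i32 0
  //   %splat = shufflevector <4 x i32> %ins, <4 x i32> undef,
  //                          <4 x i32> zeroinitializer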
1816   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1817 
1818   return Shuf;
1819 }
1820 
1821 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1822     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1823   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1824          "Expected either an induction phi-node or a truncate of it!");
1825   Value *Start = II.getStartValue();
1826 
1827   // Construct the initial value of the vector IV in the vector loop preheader
1828   auto CurrIP = Builder.saveIP();
1829   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1830   if (isa<TruncInst>(EntryVal)) {
1831     assert(Start->getType()->isIntegerTy() &&
1832            "Truncation requires an integer type");
1833     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1834     Step = Builder.CreateTrunc(Step, TruncType);
1835     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1836   }
1837   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1838   Value *SteppedStart =
1839       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1840 
1841   // We create vector phi nodes for both integer and floating-point induction
1842   // variables. Here, we determine the kind of arithmetic we will perform.
1843   Instruction::BinaryOps AddOp;
1844   Instruction::BinaryOps MulOp;
1845   if (Step->getType()->isIntegerTy()) {
1846     AddOp = Instruction::Add;
1847     MulOp = Instruction::Mul;
1848   } else {
1849     AddOp = II.getInductionOpcode();
1850     MulOp = Instruction::FMul;
1851   }
1852 
1853   // Multiply the vectorization factor by the step using integer or
1854   // floating-point arithmetic as appropriate.
1855   Value *ConstVF =
1856       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
1857   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1858 
1859   // Create a vector splat to use in the induction update.
1860   //
1861   // FIXME: If the step is non-constant, we create the vector splat with
1862   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1863   //        handle a constant vector splat.
1864   assert(!VF.isScalable() && "scalable vectors not yet supported.");
1865   Value *SplatVF = isa<Constant>(Mul)
1866                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1867                        : Builder.CreateVectorSplat(VF, Mul);
1868   Builder.restoreIP(CurrIP);
1869 
1870   // We may need to add the step a number of times, depending on the unroll
1871   // factor. The last of those goes into the PHI.
1872   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1873                                     &*LoopVectorBody->getFirstInsertionPt());
1874   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1875   Instruction *LastInduction = VecInd;
1876   for (unsigned Part = 0; Part < UF; ++Part) {
1877     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1878 
1879     if (isa<TruncInst>(EntryVal))
1880       addMetadata(LastInduction, EntryVal);
1881     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1882 
1883     LastInduction = cast<Instruction>(addFastMathFlag(
1884         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1885     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1886   }
1887 
1888   // Move the last step to the end of the latch block. This ensures consistent
1889   // placement of all induction updates.
1890   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1891   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1892   auto *ICmp = cast<Instruction>(Br->getCondition());
1893   LastInduction->moveBefore(ICmp);
1894   LastInduction->setName("vec.ind.next");
1895 
1896   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1897   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1898 }
1899 
1900 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1901   return Cost->isScalarAfterVectorization(I, VF) ||
1902          Cost->isProfitableToScalarize(I, VF);
1903 }
1904 
1905 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1906   if (shouldScalarizeInstruction(IV))
1907     return true;
1908   auto isScalarInst = [&](User *U) -> bool {
1909     auto *I = cast<Instruction>(U);
1910     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1911   };
1912   return llvm::any_of(IV->users(), isScalarInst);
1913 }
1914 
1915 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1916     const InductionDescriptor &ID, const Instruction *EntryVal,
1917     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1918   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1919          "Expected either an induction phi-node or a truncate of it!");
1920 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
1927   if (isa<TruncInst>(EntryVal))
1928     return;
1929 
1930   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1931   if (Casts.empty())
1932     return;
1933   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if any exist) have no uses outside the
1935   // induction update chain itself.
1936   Instruction *CastInst = *Casts.begin();
1937   if (Lane < UINT_MAX)
1938     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1939   else
1940     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1941 }
1942 
1943 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1944   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1945          "Primary induction variable must have an integer type");
1946 
1947   auto II = Legal->getInductionVars().find(IV);
1948   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1949 
1950   auto ID = II->second;
1951   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1952 
1953   // The value from the original loop to which we are mapping the new induction
1954   // variable.
1955   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1956 
1957   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1958 
1959   // Generate code for the induction step. Note that induction steps are
1960   // required to be loop-invariant
1961   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1962     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1963            "Induction step should be loop invariant");
1964     if (PSE.getSE()->isSCEVable(IV->getType())) {
1965       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1966       return Exp.expandCodeFor(Step, Step->getType(),
1967                                LoopVectorPreHeader->getTerminator());
1968     }
1969     return cast<SCEVUnknown>(Step)->getValue();
1970   };
1971 
1972   // The scalar value to broadcast. This is derived from the canonical
1973   // induction variable. If a truncation type is given, truncate the canonical
1974   // induction variable and step. Otherwise, derive these values from the
1975   // induction descriptor.
1976   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1977     Value *ScalarIV = Induction;
1978     if (IV != OldInduction) {
1979       ScalarIV = IV->getType()->isIntegerTy()
1980                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1981                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1982                                           IV->getType());
1983       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1984       ScalarIV->setName("offset.idx");
1985     }
1986     if (Trunc) {
1987       auto *TruncType = cast<IntegerType>(Trunc->getType());
1988       assert(Step->getType()->isIntegerTy() &&
1989              "Truncation requires an integer step");
1990       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1991       Step = Builder.CreateTrunc(Step, TruncType);
1992     }
1993     return ScalarIV;
1994   };
1995 
1996   // Create the vector values from the scalar IV, in the absence of creating a
1997   // vector IV.
1998   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1999     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2000     for (unsigned Part = 0; Part < UF; ++Part) {
2001       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2002       Value *EntryPart =
2003           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2004                         ID.getInductionOpcode());
2005       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2006       if (Trunc)
2007         addMetadata(EntryPart, Trunc);
2008       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
2009     }
2010   };
2011 
2012   // Now do the actual transformations, and start with creating the step value.
2013   Value *Step = CreateStepValue(ID.getStep());
2014   if (VF.isZero() || VF.isScalar()) {
2015     Value *ScalarIV = CreateScalarIV(Step);
2016     CreateSplatIV(ScalarIV, Step);
2017     return;
2018   }
2019 
2020   // Determine if we want a scalar version of the induction variable. This is
2021   // true if the induction variable itself is not widened, or if it has at
2022   // least one user in the loop that is not widened.
2023   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2024   if (!NeedsScalarIV) {
2025     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2026     return;
2027   }
2028 
2029   // Try to create a new independent vector induction variable. If we can't
2030   // create the phi node, we will splat the scalar induction variable in each
2031   // loop iteration.
2032   if (!shouldScalarizeInstruction(EntryVal)) {
2033     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2034     Value *ScalarIV = CreateScalarIV(Step);
2035     // Create scalar steps that can be used by instructions we will later
2036     // scalarize. Note that the addition of the scalar steps will not increase
2037     // the number of instructions in the loop in the common case prior to
2038     // InstCombine. We will be trading one vector extract for each scalar step.
2039     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2040     return;
2041   }
2042 
  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV, except when we tail-fold: then the splat IV feeds the
  // predicate used by the masked loads/stores.
2046   Value *ScalarIV = CreateScalarIV(Step);
2047   if (!Cost->isScalarEpilogueAllowed())
2048     CreateSplatIV(ScalarIV, Step);
2049   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2050 }
2051 
2052 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2053                                           Instruction::BinaryOps BinOp) {
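  // For example (illustrative): with a splatted Val, an integer Step of 1,
  // StartIdx = 0 and VF = 4, the result is <Val, Val+1, Val+2, Val+3>, i.e.
  // the per-lane values of one vector induction step.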
2054   // Create and check the types.
2055   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2056   int VLen = ValVTy->getNumElements();
2057 
2058   Type *STy = Val->getType()->getScalarType();
2059   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2060          "Induction Step must be an integer or FP");
2061   assert(Step->getType() == STy && "Step has wrong type");
2062 
2063   SmallVector<Constant *, 8> Indices;
2064 
2065   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers starting at StartIdx.
2067     for (int i = 0; i < VLen; ++i)
2068       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2069 
2070     // Add the consecutive indices to the vector value.
2071     Constant *Cv = ConstantVector::get(Indices);
2072     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2073     Step = Builder.CreateVectorSplat(VLen, Step);
2074     assert(Step->getType() == Val->getType() && "Invalid step vec");
2075     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2076     // which can be found from the original scalar operations.
2077     Step = Builder.CreateMul(Cv, Step);
2078     return Builder.CreateAdd(Val, Step, "induction");
2079   }
2080 
2081   // Floating point induction.
2082   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2083          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers starting at StartIdx.
2085   for (int i = 0; i < VLen; ++i)
2086     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2087 
2088   // Add the consecutive indices to the vector value.
2089   Constant *Cv = ConstantVector::get(Indices);
2090 
2091   Step = Builder.CreateVectorSplat(VLen, Step);
2092 
2093   // Floating point operations had to be 'fast' to enable the induction.
2094   FastMathFlags Flags;
2095   Flags.setFast();
2096 
2097   Value *MulOp = Builder.CreateFMul(Cv, Step);
2098   if (isa<Instruction>(MulOp))
    // Have to check because MulOp may have been folded to a constant.
2100     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2101 
2102   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2103   if (isa<Instruction>(BOp))
2104     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2105   return BOp;
2106 }
2107 
2108 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2109                                            Instruction *EntryVal,
2110                                            const InductionDescriptor &ID) {
2111   // We shouldn't have to build scalar steps if we aren't vectorizing.
2112   assert(VF.isVector() && "VF should be greater than one");
2113   assert(!VF.isScalable() &&
2114          "the code below assumes a fixed number of elements at compile time");
2115   // Get the value type and ensure it and the step have the same integer type.
2116   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2117   assert(ScalarIVTy == Step->getType() &&
2118          "Val and Step should have the same type");
2119 
2120   // We build scalar steps for both integer and floating-point induction
2121   // variables. Here, we determine the kind of arithmetic we will perform.
2122   Instruction::BinaryOps AddOp;
2123   Instruction::BinaryOps MulOp;
2124   if (ScalarIVTy->isIntegerTy()) {
2125     AddOp = Instruction::Add;
2126     MulOp = Instruction::Mul;
2127   } else {
2128     AddOp = ID.getInductionOpcode();
2129     MulOp = Instruction::FMul;
2130   }
2131 
2132   // Determine the number of scalars we need to generate for each unroll
2133   // iteration. If EntryVal is uniform, we only need to generate the first
2134   // lane. Otherwise, we generate all VF values.
2135   unsigned Lanes =
2136       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2137           ? 1
2138           : VF.getKnownMinValue();
2139   // Compute the scalar steps and save the results in VectorLoopValueMap.
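  // For example (illustrative), with UF = 1, VF = 4 and an integer Step of 1,
  // the values produced below are ScalarIV + 0, ScalarIV + 1, ScalarIV + 2 and
  // ScalarIV + 3, one per lane.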
2140   for (unsigned Part = 0; Part < UF; ++Part) {
2141     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2142       auto *StartIdx = getSignedIntOrFpConstant(
2143           ScalarIVTy, VF.getKnownMinValue() * Part + Lane);
2144       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2145       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2146       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2147       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2148     }
2149   }
2150 }
2151 
2152 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2153   assert(V != Induction && "The new induction variable should not be used.");
2154   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2155   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2156 
2157   // If we have a stride that is replaced by one, do it here. Defer this for
2158   // the VPlan-native path until we start running Legal checks in that path.
2159   if (!EnableVPlanNativePath && Legal->hasStride(V))
2160     V = ConstantInt::get(V->getType(), 1);
2161 
2162   // If we have a vector mapped to this value, return it.
2163   if (VectorLoopValueMap.hasVectorValue(V, Part))
2164     return VectorLoopValueMap.getVectorValue(V, Part);
2165 
2166   // If the value has not been vectorized, check if it has been scalarized
2167   // instead. If it has been scalarized, and we actually need the value in
2168   // vector form, we will construct the vector values on demand.
2169   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2170     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2171 
2172     // If we've scalarized a value, that value should be an instruction.
2173     auto *I = cast<Instruction>(V);
2174 
2175     // If we aren't vectorizing, we can just copy the scalar map values over to
2176     // the vector map.
2177     if (VF.isScalar()) {
2178       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2179       return ScalarValue;
2180     }
2181 
2182     // Get the last scalar instruction we generated for V and Part. If the value
2183     // is known to be uniform after vectorization, this corresponds to lane zero
2184     // of the Part unroll iteration. Otherwise, the last instruction is the one
2185     // we created for the last vector lane of the Part unroll iteration.
2186     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2187     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2188                             ? 0
2189                             : VF.getKnownMinValue() - 1;
2190     auto *LastInst = cast<Instruction>(
2191         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2192 
2193     // Set the insert point after the last scalarized instruction. This ensures
2194     // the insertelement sequence will directly follow the scalar definitions.
2195     auto OldIP = Builder.saveIP();
2196     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2197     Builder.SetInsertPoint(&*NewIP);
2198 
2199     // However, if we are vectorizing, we need to construct the vector values.
2200     // If the value is known to be uniform after vectorization, we can just
2201     // broadcast the scalar value corresponding to lane zero for each unroll
2202     // iteration. Otherwise, we construct the vector values using insertelement
2203     // instructions. Since the resulting vectors are stored in
2204     // VectorLoopValueMap, we will only generate the insertelements once.
2205     Value *VectorValue = nullptr;
2206     if (Cost->isUniformAfterVectorization(I, VF)) {
2207       VectorValue = getBroadcastInstrs(ScalarValue);
2208       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2209     } else {
2210       // Initialize packing with insertelements to start from undef.
2211       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2212       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2213       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2214       for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2215         packScalarIntoVectorValue(V, {Part, Lane});
2216       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2217     }
2218     Builder.restoreIP(OldIP);
2219     return VectorValue;
2220   }
2221 
2222   // If this scalar is unknown, assume that it is a constant or that it is
2223   // loop invariant. Broadcast V and save the value for future uses.
2224   Value *B = getBroadcastInstrs(V);
2225   VectorLoopValueMap.setVectorValue(V, Part, B);
2226   return B;
2227 }
2228 
2229 Value *
2230 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2231                                             const VPIteration &Instance) {
2232   // If the value is not an instruction contained in the loop, it should
2233   // already be scalar.
2234   if (OrigLoop->isLoopInvariant(V))
2235     return V;
2236 
2237   assert(Instance.Lane > 0
2238              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2239              : true && "Uniform values only have lane zero");
2240 
2241   // If the value from the original loop has not been vectorized, it is
2242   // represented by UF x VF scalar values in the new loop. Return the requested
2243   // scalar value.
2244   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2245     return VectorLoopValueMap.getScalarValue(V, Instance);
2246 
2247   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2248   // for the given unroll part. If this entry is not a vector type (i.e., the
2249   // vectorization factor is one), there is no need to generate an
2250   // extractelement instruction.
2251   auto *U = getOrCreateVectorValue(V, Instance.Part);
2252   if (!U->getType()->isVectorTy()) {
2253     assert(VF.isScalar() && "Value not scalarized has non-vector type");
2254     return U;
2255   }
2256 
2257   // Otherwise, the value from the original loop has been vectorized and is
2258   // represented by UF vector values. Extract and return the requested scalar
2259   // value from the appropriate vector lane.
2260   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2261 }
2262 
2263 void InnerLoopVectorizer::packScalarIntoVectorValue(
2264     Value *V, const VPIteration &Instance) {
2265   assert(V != Induction && "The new induction variable should not be used.");
2266   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2267   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2268 
2269   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2270   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2271   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2272                                             Builder.getInt32(Instance.Lane));
2273   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2274 }
2275 
2276 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2277   assert(Vec->getType()->isVectorTy() && "Invalid type");
2278   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
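  // For example (illustrative), with VF = 4 the mask built below is
  // <3, 2, 1, 0>, so a vector <a, b, c, d> is reversed to <d, c, b, a>.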
2279   SmallVector<int, 8> ShuffleMask;
2280   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2281     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2282 
2283   return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2284 }
2285 
2286 // Return whether we allow using masked interleave-groups (for dealing with
2287 // strided loads/stores that reside in predicated blocks, or for dealing
2288 // with gaps).
2289 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2290   // If an override option has been passed in for interleaved accesses, use it.
2291   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2292     return EnableMaskedInterleavedMemAccesses;
2293 
2294   return TTI.enableMaskedInterleavedAccessVectorization();
2295 }
2296 
2297 // Try to vectorize the interleave group that \p Instr belongs to.
2298 //
2299 // E.g. Translate following interleaved load group (factor = 3):
2300 //   for (i = 0; i < N; i+=3) {
2301 //     R = Pic[i];             // Member of index 0
2302 //     G = Pic[i+1];           // Member of index 1
2303 //     B = Pic[i+2];           // Member of index 2
2304 //     ... // do something to R, G, B
2305 //   }
2306 // To:
2307 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2308 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2309 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2310 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2311 //
2312 // Or translate following interleaved store group (factor = 3):
2313 //   for (i = 0; i < N; i+=3) {
2314 //     ... do something to R, G, B
2315 //     Pic[i]   = R;           // Member of index 0
2316 //     Pic[i+1] = G;           // Member of index 1
2317 //     Pic[i+2] = B;           // Member of index 2
2318 //   }
2319 // To:
2320 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2321 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2322 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2323 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2324 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2325 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2326     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2327     VPValue *Addr, ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask) {
2328   Instruction *Instr = Group->getInsertPos();
2329   const DataLayout &DL = Instr->getModule()->getDataLayout();
2330 
2331   // Prepare for the vector type of the interleaved load/store.
2332   Type *ScalarTy = getMemInstValueType(Instr);
2333   unsigned InterleaveFactor = Group->getFactor();
2334   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2335   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2336 
2337   // Prepare for the new pointers.
2338   SmallVector<Value *, 2> AddrParts;
2339   unsigned Index = Group->getIndex(Instr);
2340 
2341   // TODO: extend the masked interleaved-group support to reversed access.
2342   assert((!BlockInMask || !Group->isReverse()) &&
2343          "Reversed masked interleave-group not supported.");
2344 
2345   // If the group is reverse, adjust the index to refer to the last vector lane
2346   // instead of the first. We adjust the index from the first vector lane,
2347   // rather than directly getting the pointer for lane VF - 1, because the
2348   // pointer operand of the interleaved access is supposed to be uniform. For
2349   // uniform instructions, we're only required to generate a value for the
2350   // first vector lane in each unroll iteration.
2351   assert(!VF.isScalable() &&
2352          "scalable vector reverse operation is not implemented");
2353   if (Group->isReverse())
2354     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2355 
2356   for (unsigned Part = 0; Part < UF; Part++) {
2357     Value *AddrPart = State.get(Addr, {Part, 0});
2358     setDebugLocFromInst(Builder, AddrPart);
2359 
    // Note that the current instruction could be a member at any index; we
    // need to adjust the address to that of the member at index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2371 
2372     bool InBounds = false;
2373     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2374       InBounds = gep->isInBounds();
2375     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2376     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2377 
2378     // Cast to the vector pointer type.
2379     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2380     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2381     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2382   }
2383 
2384   setDebugLocFromInst(Builder, Instr);
2385   Value *UndefVec = UndefValue::get(VecTy);
2386 
2387   Value *MaskForGaps = nullptr;
2388   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2389     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2390     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2391     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2392   }
2393 
2394   // Vectorize the interleaved load group.
2395   if (isa<LoadInst>(Instr)) {
2396     // For each unroll part, create a wide load for the group.
2397     SmallVector<Value *, 2> NewLoads;
2398     for (unsigned Part = 0; Part < UF; Part++) {
2399       Instruction *NewLoad;
2400       if (BlockInMask || MaskForGaps) {
2401         assert(useMaskedInterleavedAccesses(*TTI) &&
2402                "masked interleaved groups are not allowed.");
2403         Value *GroupMask = MaskForGaps;
2404         if (BlockInMask) {
2405           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2406           assert(!VF.isScalable() && "scalable vectors not yet supported.");
2407           Value *ShuffledMask = Builder.CreateShuffleVector(
2408               BlockInMaskPart,
2409               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2410               "interleaved.mask");
2411           GroupMask = MaskForGaps
2412                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2413                                                 MaskForGaps)
2414                           : ShuffledMask;
2415         }
2416         NewLoad =
2417             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2418                                      GroupMask, UndefVec, "wide.masked.vec");
2419       }
2420       else
2421         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2422                                             Group->getAlign(), "wide.vec");
2423       Group->addMetadata(NewLoad);
2424       NewLoads.push_back(NewLoad);
2425     }
2426 
2427     // For each member in the group, shuffle out the appropriate data from the
2428     // wide loads.
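    // For example (illustrative only): with VF = 4 and an interleave factor
    // of 2, member 0 is extracted with the shuffle mask <0, 2, 4, 6> and
    // member 1 with <1, 3, 5, 7>, each producing a strided vector of VF
    // elements.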
2429     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2430       Instruction *Member = Group->getMember(I);
2431 
2432       // Skip the gaps in the group.
2433       if (!Member)
2434         continue;
2435 
2436       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2437       auto StrideMask =
2438           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2439       for (unsigned Part = 0; Part < UF; Part++) {
2440         Value *StridedVec = Builder.CreateShuffleVector(
2441             NewLoads[Part], StrideMask, "strided.vec");
2442 
        // If this member has a different type, cast the result to that type.
2444         if (Member->getType() != ScalarTy) {
2445           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2446           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2447           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2448         }
2449 
2450         if (Group->isReverse())
2451           StridedVec = reverseVector(StridedVec);
2452 
2453         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2454       }
2455     }
2456     return;
2457   }
2458 
  // The sub-vector type for the current instruction.
2460   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2461   auto *SubVT = VectorType::get(ScalarTy, VF);
2462 
2463   // Vectorize the interleaved store group.
2464   for (unsigned Part = 0; Part < UF; Part++) {
2465     // Collect the stored vector from each member.
2466     SmallVector<Value *, 4> StoredVecs;
2467     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
      assert(Group->getMember(i) &&
             "Fail to get a member from an interleaved store group");
2470 
2471       Value *StoredVec = State.get(StoredValues[i], Part);
2472 
2473       if (Group->isReverse())
2474         StoredVec = reverseVector(StoredVec);
2475 
      // If this member has a different type, cast it to the unified type.
2478       if (StoredVec->getType() != SubVT)
2479         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2480 
2481       StoredVecs.push_back(StoredVec);
2482     }
2483 
2484     // Concatenate all vectors into a wide vector.
2485     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2486 
2487     // Interleave the elements in the wide vector.
2488     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2489     Value *IVec = Builder.CreateShuffleVector(
2490         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2491         "interleaved.vec");
2492 
2493     Instruction *NewStoreInstr;
2494     if (BlockInMask) {
2495       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2496       Value *ShuffledMask = Builder.CreateShuffleVector(
2497           BlockInMaskPart,
2498           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2499           "interleaved.mask");
2500       NewStoreInstr = Builder.CreateMaskedStore(
2501           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2502     }
2503     else
2504       NewStoreInstr =
2505           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2506 
2507     Group->addMetadata(NewStoreInstr);
2508   }
2509 }
2510 
2511 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2512     Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2513     VPValue *StoredValue, VPValue *BlockInMask) {
2514   // Attempt to issue a wide load.
2515   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2516   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2517 
2518   assert((LI || SI) && "Invalid Load/Store instruction");
2519   assert((!SI || StoredValue) && "No stored value provided for widened store");
2520   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2521 
2522   LoopVectorizationCostModel::InstWidening Decision =
2523       Cost->getWideningDecision(Instr, VF);
2524   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2525           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2526           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2527          "CM decision is not to widen the memory instruction");
2528 
2529   Type *ScalarDataTy = getMemInstValueType(Instr);
2530 
2531   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2532   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2533   const Align Alignment = getLoadStoreAlignment(Instr);
2534 
2535   // Determine if the pointer operand of the access is either consecutive or
2536   // reverse consecutive.
2537   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2538   bool ConsecutiveStride =
2539       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2540   bool CreateGatherScatter =
2541       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2542 
2543   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2544   // gather/scatter. Otherwise Decision should have been to Scalarize.
2545   assert((ConsecutiveStride || CreateGatherScatter) &&
2546          "The instruction should be scalarized");
2547   (void)ConsecutiveStride;
2548 
2549   VectorParts BlockInMaskParts(UF);
2550   bool isMaskRequired = BlockInMask;
2551   if (isMaskRequired)
2552     for (unsigned Part = 0; Part < UF; ++Part)
2553       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2554 
2555   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2556     // Calculate the pointer for the specific unroll-part.
2557     GetElementPtrInst *PartPtr = nullptr;
2558 
2559     bool InBounds = false;
2560     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2561       InBounds = gep->isInBounds();
2562 
2563     if (Reverse) {
2564       // If the address is consecutive but reversed, then the
2565       // wide store needs to start at the last vector element.
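      // For example (a sketch, assuming VF = 4): Part 1 is first rebased to
      // Ptr[-4] and then offset by (1 - 4) = -3, so the wide access spans
      // Ptr[-7 .. -4]; the loaded result or stored value is reversed
      // separately (see reverseVector) to match the scalar order.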
2566       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2567           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2568       PartPtr->setIsInBounds(InBounds);
2569       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2570           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2571       PartPtr->setIsInBounds(InBounds);
2572       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2573         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2574     } else {
2575       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2576           ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue())));
2577       PartPtr->setIsInBounds(InBounds);
2578     }
2579 
2580     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2581     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2582   };
2583 
2584   // Handle Stores:
2585   if (SI) {
2586     setDebugLocFromInst(Builder, SI);
2587 
2588     for (unsigned Part = 0; Part < UF; ++Part) {
2589       Instruction *NewSI = nullptr;
2590       Value *StoredVal = State.get(StoredValue, Part);
2591       if (CreateGatherScatter) {
2592         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2593         Value *VectorGep = State.get(Addr, Part);
2594         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2595                                             MaskPart);
2596       } else {
2597         if (Reverse) {
2598           // If we store to reverse consecutive memory locations, then we need
2599           // to reverse the order of elements in the stored value.
2600           StoredVal = reverseVector(StoredVal);
2601           // We don't want to update the value in the map as it might be used in
2602           // another expression. So don't call resetVectorValue(StoredVal).
2603         }
2604         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2605         if (isMaskRequired)
2606           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2607                                             BlockInMaskParts[Part]);
2608         else
2609           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2610       }
2611       addMetadata(NewSI, SI);
2612     }
2613     return;
2614   }
2615 
2616   // Handle loads.
2617   assert(LI && "Must have a load instruction");
2618   setDebugLocFromInst(Builder, LI);
2619   for (unsigned Part = 0; Part < UF; ++Part) {
2620     Value *NewLI;
2621     if (CreateGatherScatter) {
2622       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2623       Value *VectorGep = State.get(Addr, Part);
2624       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2625                                          nullptr, "wide.masked.gather");
2626       addMetadata(NewLI, LI);
2627     } else {
2628       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2629       if (isMaskRequired)
2630         NewLI = Builder.CreateMaskedLoad(
2631             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2632             "wide.masked.load");
2633       else
2634         NewLI =
2635             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2636 
2637       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2638       addMetadata(NewLI, LI);
2639       if (Reverse)
2640         NewLI = reverseVector(NewLI);
2641     }
2642 
2643     State.set(Def, Instr, NewLI, Part);
2644   }
2645 }
2646 
2647 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2648                                                const VPIteration &Instance,
2649                                                bool IfPredicateInstr,
2650                                                VPTransformState &State) {
2651   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2652 
2653   setDebugLocFromInst(Builder, Instr);
2654 
  // Does this instruction return a value?
2656   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2657 
2658   Instruction *Cloned = Instr->clone();
2659   if (!IsVoidRetTy)
2660     Cloned->setName(Instr->getName() + ".cloned");
2661 
  // Replace the operands of the cloned instruction with their scalar
2663   // equivalents in the new loop.
2664   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2665     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2666     auto InputInstance = Instance;
2667     if (!Operand || !OrigLoop->contains(Operand) ||
2668         (Cost->isUniformAfterVectorization(Operand, State.VF)))
2669       InputInstance.Lane = 0;
2670     auto *NewOp = State.get(User.getOperand(op), InputInstance);
2671     Cloned->setOperand(op, NewOp);
2672   }
2673   addNewMetadata(Cloned, Instr);
2674 
2675   // Place the cloned scalar in the new loop.
2676   Builder.Insert(Cloned);
2677 
  // TODO: Set result for VPValue of VPReplicateRecipe. This requires
2679   // representing scalar values in VPTransformState. Add the cloned scalar to
2680   // the scalar map entry.
2681   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2682 
  // If we just cloned a new assumption, add it to the assumption cache.
2684   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2685     if (II->getIntrinsicID() == Intrinsic::assume)
2686       AC->registerAssumption(II);
2687 
2688   // End if-block.
2689   if (IfPredicateInstr)
2690     PredicatedInstructions.push_back(Cloned);
2691 }
2692 
2693 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2694                                                       Value *End, Value *Step,
2695                                                       Instruction *DL) {
2696   BasicBlock *Header = L->getHeader();
2697   BasicBlock *Latch = L->getLoopLatch();
2698   // As we're just creating this loop, it's possible no latch exists
2699   // yet. If so, use the header as this will be a single block loop.
2700   if (!Latch)
2701     Latch = Header;
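
  // Conceptually this emits (a sketch; value names are illustrative):
  //   %index = phi [ %Start, %preheader ], [ %index.next, %latch ]
  //   ...
  //   %index.next = add %index, %Step
  //   %cond = icmp eq %index.next, %End
  //   br i1 %cond, label %exit, label %header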
2702 
2703   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2704   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2705   setDebugLocFromInst(Builder, OldInst);
2706   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2707 
2708   Builder.SetInsertPoint(Latch->getTerminator());
2709   setDebugLocFromInst(Builder, OldInst);
2710 
2711   // Create i+1 and fill the PHINode.
2712   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2713   Induction->addIncoming(Start, L->getLoopPreheader());
2714   Induction->addIncoming(Next, Latch);
2715   // Create the compare.
2716   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2717   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2718 
2719   // Now we have two terminators. Remove the old one from the block.
2720   Latch->getTerminator()->eraseFromParent();
2721 
2722   return Induction;
2723 }
2724 
2725 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2726   if (TripCount)
2727     return TripCount;
2728 
2729   assert(L && "Create Trip Count for null loop.");
2730   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2731   // Find the loop boundaries.
2732   ScalarEvolution *SE = PSE.getSE();
2733   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2734   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2735          "Invalid loop count");
2736 
2737   Type *IdxTy = Legal->getWidestInductionType();
2738   assert(IdxTy && "No type for induction");
2739 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign-extended before the
  // compare. The only way we can get a backedge-taken count in that case is
  // if the induction variable was signed and as such cannot overflow, so
  // truncating the count to the narrower type is legal.
2745   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2746       IdxTy->getPrimitiveSizeInBits())
2747     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2748   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2749 
2750   // Get the total trip count from the count by adding 1.
2751   const SCEV *ExitCount = SE->getAddExpr(
2752       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2753 
2754   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2755 
  // Expand the trip count and place the new instructions in the preheader.
  // Note that this only adds instructions to the pre-header; the structure of
  // the loop itself does not change.
2758   SCEVExpander Exp(*SE, DL, "induction");
2759 
2760   // Count holds the overall loop count (N).
2761   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2762                                 L->getLoopPreheader()->getTerminator());
2763 
2764   if (TripCount->getType()->isPointerTy())
2765     TripCount =
2766         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2767                                     L->getLoopPreheader()->getTerminator());
2768 
2769   return TripCount;
2770 }
2771 
2772 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2773   if (VectorTripCount)
2774     return VectorTripCount;
2775 
2776   Value *TC = getOrCreateTripCount(L);
2777   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2778 
2779   Type *Ty = TC->getType();
2780   // This is where we can make the step a runtime constant.
2781   assert(!VF.isScalable() && "scalable vectorization is not supported yet");
2782   Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
2783 
2784   // If the tail is to be folded by masking, round the number of iterations N
2785   // up to a multiple of Step instead of rounding down. This is done by first
2786   // adding Step-1 and then rounding down. Note that it's ok if this addition
2787   // overflows: the vector induction variable will eventually wrap to zero given
2788   // that it starts at zero and its Step is a power of two; the loop will then
2789   // exit, with the last early-exit vector comparison also producing all-true.
2790   if (Cost->foldTailByMasking()) {
2791     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2792            "VF*UF must be a power of 2 when folding tail by masking");
2793     TC = Builder.CreateAdd(
2794         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
2795   }
2796 
2797   // Now we need to generate the expression for the part of the loop that the
2798   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2799   // iterations are not required for correctness, or N - Step, otherwise. Step
2800   // is equal to the vectorization factor (number of SIMD elements) times the
2801   // unroll factor (number of SIMD instructions).
2802   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
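  // Purely illustrative numbers (ignoring the scalar-epilogue adjustment
  // below): with VF * UF = 8 and N = 13, this gives n.mod.vf = 5 and
  // n.vec = 8; with tail folding, N was first rounded up to 20, so
  // n.mod.vf = 4 and n.vec = 16, and two masked vector iterations cover all
  // 13 original iterations.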
2803 
2804   // If there is a non-reversed interleaved group that may speculatively access
2805   // memory out-of-bounds, we need to ensure that there will be at least one
2806   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2807   // the trip count, we set the remainder to be equal to the step. If the step
2808   // does not evenly divide the trip count, no adjustment is necessary since
2809   // there will already be scalar iterations. Note that the minimum iterations
2810   // check ensures that N >= Step.
2811   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
2812     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2813     R = Builder.CreateSelect(IsZero, Step, R);
2814   }
2815 
2816   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2817 
2818   return VectorTripCount;
2819 }
2820 
2821 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2822                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as
  // DstVTy.
2824   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2825   unsigned VF = DstFVTy->getNumElements();
2826   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2828   Type *SrcElemTy = SrcVecTy->getElementType();
2829   Type *DstElemTy = DstFVTy->getElementType();
2830   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2831          "Vector elements must have same size");
2832 
2833   // Do a direct cast if element types are castable.
2834   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2835     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2836   }
  // V cannot be directly cast to the desired vector type. This may happen
  // when V is a floating point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step cast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
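  // For example (assuming 64-bit pointers in the relevant address space):
  // <2 x double> is first bitcast to <2 x i64>, which is then converted to
  // <2 x i8*> with inttoptr; both steps are emitted by CreateBitOrPointerCast
  // below.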
2841   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2842          "Only one type should be a pointer type");
2843   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2844          "Only one type should be a floating point type");
2845   Type *IntTy =
2846       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2847   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2848   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2849   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2850 }
2851 
2852 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2853                                                          BasicBlock *Bypass) {
2854   Value *Count = getOrCreateTripCount(L);
2855   // Reuse existing vector loop preheader for TC checks.
2856   // Note that new preheader block is generated for vector loop.
2857   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2858   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2859 
2860   // Generate code to check if the loop's trip count is less than VF * UF, or
2861   // equal to it in case a scalar epilogue is required; this implies that the
2862   // vector trip count is zero. This check also covers the case where adding one
2863   // to the backedge-taken count overflowed leading to an incorrect trip count
2864   // of zero. In this case we will also jump to the scalar loop.
2865   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2866                                           : ICmpInst::ICMP_ULT;
2867 
2868   // If tail is to be folded, vector loop takes care of all iterations.
2869   Value *CheckMinIters = Builder.getFalse();
2870   if (!Cost->foldTailByMasking()) {
2871     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2872     CheckMinIters = Builder.CreateICmp(
2873         P, Count,
2874         ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF),
2875         "min.iters.check");
2876   }
2877   // Create new preheader for vector loop.
2878   LoopVectorPreHeader =
2879       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2880                  "vector.ph");
2881 
2882   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2883                                DT->getNode(Bypass)->getIDom()) &&
2884          "TC check is expected to dominate Bypass");
2885 
2886   // Update dominator for Bypass & LoopExit.
2887   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2888   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2889 
2890   ReplaceInstWithInst(
2891       TCCheckBlock->getTerminator(),
2892       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2893   LoopBypassBlocks.push_back(TCCheckBlock);
2894 }
2895 
2896 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2897   // Reuse existing vector loop preheader for SCEV checks.
2898   // Note that new preheader block is generated for vector loop.
2899   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2900 
  // Generate the code to check the SCEV assumptions that we made.
2902   // We want the new basic block to start at the first instruction in a
2903   // sequence of instructions that form a check.
2904   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2905                    "scev.check");
2906   Value *SCEVCheck = Exp.expandCodeForPredicate(
2907       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2908 
2909   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2910     if (C->isZero())
2911       return;
2912 
2913   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2914            (OptForSizeBasedOnProfile &&
2915             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2916          "Cannot SCEV check stride or overflow when optimizing for size");
2917 
2918   SCEVCheckBlock->setName("vector.scevcheck");
2919   // Create new preheader for vector loop.
2920   LoopVectorPreHeader =
2921       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2922                  nullptr, "vector.ph");
2923 
  // Update dominator only if this is the first RT check.
2925   if (LoopBypassBlocks.empty()) {
2926     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2927     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2928   }
2929 
2930   ReplaceInstWithInst(
2931       SCEVCheckBlock->getTerminator(),
2932       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2933   LoopBypassBlocks.push_back(SCEVCheckBlock);
2934   AddedSafetyChecks = true;
2935 }
2936 
2937 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2938   // VPlan-native path does not do any analysis for runtime checks currently.
2939   if (EnableVPlanNativePath)
2940     return;
2941 
2942   // Reuse existing vector loop preheader for runtime memory checks.
2943   // Note that new preheader block is generated for vector loop.
2944   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2945 
  // Generate the code that checks at runtime whether the arrays overlap. We
  // put the checks into a separate block to make the more common case of few
  // elements faster.
2949   auto *LAI = Legal->getLAI();
2950   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
2951   if (!RtPtrChecking.Need)
2952     return;
2953 
2954   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2955     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2956            "Cannot emit memory checks when optimizing for size, unless forced "
2957            "to vectorize.");
2958     ORE->emit([&]() {
2959       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2960                                         L->getStartLoc(), L->getHeader())
2961              << "Code-size may be reduced by not forcing "
2962                 "vectorization, or by source-code modifications "
2963                 "eliminating the need for runtime checks "
2964                 "(e.g., adding 'restrict').";
2965     });
2966   }
2967 
2968   MemCheckBlock->setName("vector.memcheck");
2969   // Create new preheader for vector loop.
2970   LoopVectorPreHeader =
2971       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2972                  "vector.ph");
2973 
  // Update dominator only if this is the first RT check (i.e. before
  // MemCheckBlock is added to LoopBypassBlocks).
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, MemCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
  }

  auto *CondBranch = cast<BranchInst>(
      Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
  ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
  LoopBypassBlocks.push_back(MemCheckBlock);
  AddedSafetyChecks = true;
2985 
2986   Instruction *FirstCheckInst;
2987   Instruction *MemRuntimeCheck;
2988   std::tie(FirstCheckInst, MemRuntimeCheck) =
2989       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
2990                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
2991   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
2992                             "claimed checks are required");
2993   CondBranch->setCondition(MemRuntimeCheck);
2994 
2995   // We currently don't use LoopVersioning for the actual loop cloning but we
2996   // still use it to add the noalias metadata.
2997   LVer = std::make_unique<LoopVersioning>(
2998       *Legal->getLAI(),
2999       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3000       DT, PSE.getSE());
3001   LVer->prepareNoAliasMetadata();
3002 }
3003 
3004 Value *InnerLoopVectorizer::emitTransformedIndex(
3005     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3006     const InductionDescriptor &ID) const {
3007 
3008   SCEVExpander Exp(*SE, DL, "induction");
3009   auto Step = ID.getStep();
3010   auto StartValue = ID.getStartValue();
3011   assert(Index->getType() == Step->getType() &&
3012          "Index type does not match StepValue type");
3013 
3014   // Note: the IR at this point is broken. We cannot use SE to create any new
3015   // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
3018   // on InstCombine for future simplifications. Here we handle some trivial
3019   // cases only.
3020   auto CreateAdd = [&B](Value *X, Value *Y) {
3021     assert(X->getType() == Y->getType() && "Types don't match!");
3022     if (auto *CX = dyn_cast<ConstantInt>(X))
3023       if (CX->isZero())
3024         return Y;
3025     if (auto *CY = dyn_cast<ConstantInt>(Y))
3026       if (CY->isZero())
3027         return X;
3028     return B.CreateAdd(X, Y);
3029   };
3030 
3031   auto CreateMul = [&B](Value *X, Value *Y) {
3032     assert(X->getType() == Y->getType() && "Types don't match!");
3033     if (auto *CX = dyn_cast<ConstantInt>(X))
3034       if (CX->isOne())
3035         return Y;
3036     if (auto *CY = dyn_cast<ConstantInt>(Y))
3037       if (CY->isOne())
3038         return X;
3039     return B.CreateMul(X, Y);
3040   };
3041 
3042   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3043   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3044   // the DomTree is not kept up-to-date for additional blocks generated in the
3045   // vector loop. By using the header as insertion point, we guarantee that the
3046   // expanded instructions dominate all their uses.
3047   auto GetInsertPoint = [this, &B]() {
3048     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3049     if (InsertBB != LoopVectorBody &&
3050         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3051       return LoopVectorBody->getTerminator();
3052     return &*B.GetInsertPoint();
3053   };
3054   switch (ID.getKind()) {
3055   case InductionDescriptor::IK_IntInduction: {
3056     assert(Index->getType() == StartValue->getType() &&
3057            "Index type does not match StartValue type");
3058     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3059       return B.CreateSub(StartValue, Index);
3060     auto *Offset = CreateMul(
3061         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3062     return CreateAdd(StartValue, Offset);
3063   }
3064   case InductionDescriptor::IK_PtrInduction: {
3065     assert(isa<SCEVConstant>(Step) &&
3066            "Expected constant step for pointer induction");
3067     return B.CreateGEP(
3068         StartValue->getType()->getPointerElementType(), StartValue,
3069         CreateMul(Index,
3070                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3071   }
3072   case InductionDescriptor::IK_FpInduction: {
3073     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3074     auto InductionBinOp = ID.getInductionBinOp();
3075     assert(InductionBinOp &&
3076            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3077             InductionBinOp->getOpcode() == Instruction::FSub) &&
3078            "Original bin op should be defined for FP induction");
3079 
3080     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3081 
3082     // Floating point operations had to be 'fast' to enable the induction.
3083     FastMathFlags Flags;
3084     Flags.setFast();
3085 
3086     Value *MulExp = B.CreateFMul(StepValue, Index);
3087     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may have been folded to a constant.
3089       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3090 
3091     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3092                                "induction");
3093     if (isa<Instruction>(BOp))
3094       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3095 
3096     return BOp;
3097   }
3098   case InductionDescriptor::IK_NoInduction:
3099     return nullptr;
3100   }
3101   llvm_unreachable("invalid enum");
3102 }
3103 
3104 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3105   LoopScalarBody = OrigLoop->getHeader();
3106   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3107   LoopExitBlock = OrigLoop->getExitBlock();
3108   assert(LoopExitBlock && "Must have an exit block");
3109   assert(LoopVectorPreHeader && "Invalid loop structure");
3110 
3111   LoopMiddleBlock =
3112       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3113                  LI, nullptr, Twine(Prefix) + "middle.block");
3114   LoopScalarPreHeader =
3115       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3116                  nullptr, Twine(Prefix) + "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo, since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
3120   LoopVectorBody =
3121       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3122                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3123 
3124   // Update dominator for loop exit.
3125   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3126 
3127   // Create and register the new vector loop.
3128   Loop *Lp = LI->AllocateLoop();
3129   Loop *ParentLoop = OrigLoop->getParentLoop();
3130 
3131   // Insert the new loop into the loop nest and register the new basic blocks
3132   // before calling any utilities such as SCEV that require valid LoopInfo.
3133   if (ParentLoop) {
3134     ParentLoop->addChildLoop(Lp);
3135   } else {
3136     LI->addTopLevelLoop(Lp);
3137   }
3138   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3139   return Lp;
3140 }
3141 
3142 void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
3143                                                       Value *VectorTripCount) {
3144   assert(VectorTripCount && L && "Expected valid arguments");
3145   // We are going to resume the execution of the scalar loop.
3146   // Go over all of the induction variables that we found and fix the
3147   // PHIs that are left in the scalar version of the loop.
3148   // The starting values of PHI nodes depend on the counter of the last
3149   // iteration in the vectorized loop.
3150   // If we come from a bypass edge then we need to start from the original
3151   // start value.
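  // For example (a sketch): for the primary induction, which counts from
  // zero in steps of one, the end value is simply the vector trip count; for
  // an integer IV with start S and step St it is S + St * n.vec, computed via
  // emitTransformedIndex below.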
3152   for (auto &InductionEntry : Legal->getInductionVars()) {
3153     PHINode *OrigPhi = InductionEntry.first;
3154     InductionDescriptor II = InductionEntry.second;
3155 
    // Create phi nodes to merge from the backedge-taken check block.
3157     PHINode *BCResumeVal =
3158         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3159                         LoopScalarPreHeader->getTerminator());
3160     // Copy original phi DL over to the new one.
3161     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3162     Value *&EndValue = IVEndValues[OrigPhi];
3163     if (OrigPhi == OldInduction) {
3164       // We know what the end value is.
3165       EndValue = VectorTripCount;
3166     } else {
3167       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3168       Type *StepType = II.getStep()->getType();
3169       Instruction::CastOps CastOp =
3170           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3171       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3172       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3173       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3174       EndValue->setName("ind.end");
3175     }
3176 
3177     // The new PHI merges the original incoming value, in case of a bypass,
3178     // or the value at the end of the vectorized loop.
3179     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3180 
3181     // Fix the scalar body counter (PHI node).
3182     // The old induction's phi node in the scalar body needs the truncated
3183     // value.
3184     for (BasicBlock *BB : LoopBypassBlocks)
3185       BCResumeVal->addIncoming(II.getStartValue(), BB);
3186     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3187   }
3188 }
3189 
3190 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3191                                                       MDNode *OrigLoopID) {
3192   assert(L && "Expected valid loop.");
3193 
3194   // The trip counts should be cached by now.
3195   Value *Count = getOrCreateTripCount(L);
3196   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3197 
3198   // We need the OrigLoop (scalar loop part) latch terminator to help
3199   // produce correct debug info for the middle block BB instructions.
3200   // The legality check stage guarantees that the loop will have a single
3201   // latch.
3202   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3203          "Scalar loop latch terminator isn't a branch");
3204   BranchInst *ScalarLatchBr =
3205       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3206 
3207   // Add a check in the middle block to see if we have completed
3208   // all of the iterations in the first vector loop.
3209   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3210   // If tail is to be folded, we know we don't need to run the remainder.
3211   Value *CmpN = Builder.getTrue();
3212   if (!Cost->foldTailByMasking()) {
3213     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3214                            VectorTripCount, "cmp.n",
3215                            LoopMiddleBlock->getTerminator());
3216 
3217     // Here we use the same DebugLoc as the scalar loop latch branch instead
3218     // of the corresponding compare because they may have ended up with
3219     // different line numbers and we want to avoid awkward line stepping while
    // debugging, e.g. if the compare has a line number inside the loop.
3221     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3222   }
3223 
3224   BranchInst *BrInst =
3225       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3226   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3227   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3228 
3229   // Get ready to start creating new instructions into the vectorized body.
3230   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3231          "Inconsistent vector loop preheader");
3232   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3233 
3234   Optional<MDNode *> VectorizedLoopID =
3235       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3236                                       LLVMLoopVectorizeFollowupVectorized});
3237   if (VectorizedLoopID.hasValue()) {
3238     L->setLoopID(VectorizedLoopID.getValue());
3239 
3240     // Do not setAlreadyVectorized if loop attributes have been defined
3241     // explicitly.
3242     return LoopVectorPreHeader;
3243   }
3244 
3245   // Keep all loop hints from the original loop on the vector loop (we'll
3246   // replace the vectorizer-specific hints below).
3247   if (MDNode *LID = OrigLoop->getLoopID())
3248     L->setLoopID(LID);
3249 
3250   LoopVectorizeHints Hints(L, true, *ORE);
3251   Hints.setAlreadyVectorized();
3252 
3253 #ifdef EXPENSIVE_CHECKS
3254   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3255   LI->verify(*DT);
3256 #endif
3257 
3258   return LoopVectorPreHeader;
3259 }
3260 
3261 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3262   /*
3263    In this function we generate a new loop. The new loop will contain
3264    the vectorized instructions while the old loop will continue to run the
3265    scalar remainder.
3266 
3267        [ ] <-- loop iteration number check.
3268     /   |
3269    /    v
3270   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3271   |  /  |
3272   | /   v
3273   ||   [ ]     <-- vector pre header.
3274   |/    |
3275   |     v
3276   |    [  ] \
3277   |    [  ]_|   <-- vector loop.
3278   |     |
3279   |     v
3280   |   -[ ]   <--- middle-block.
3281   |  /  |
3282   | /   v
3283   -|- >[ ]     <--- new preheader.
3284    |    |
3285    |    v
3286    |   [ ] \
3287    |   [ ]_|   <-- old scalar loop to handle remainder.
3288     \   |
3289      \  v
3290       >[ ]     <-- exit block.
3291    ...
3292    */
3293 
3294   // Get the metadata of the original loop before it gets modified.
3295   MDNode *OrigLoopID = OrigLoop->getLoopID();
3296 
3297   // Create an empty vector loop, and prepare basic blocks for the runtime
3298   // checks.
3299   Loop *Lp = createVectorLoopSkeleton("");
3300 
3301   // Now, compare the new count to zero. If it is zero skip the vector loop and
3302   // jump to the scalar loop. This check also covers the case where the
3303   // backedge-taken count is uint##_max: adding one to it will overflow leading
3304   // to an incorrect trip count of zero. In this (rare) case we will also jump
3305   // to the scalar loop.
3306   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3307 
3308   // Generate the code to check any assumptions that we've made for SCEV
3309   // expressions.
3310   emitSCEVChecks(Lp, LoopScalarPreHeader);
3311 
  // Generate the code that checks at runtime whether the arrays overlap. We
  // put the checks into a separate block to make the more common case of few
  // elements faster.
3315   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3316 
3317   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
3319   // induction variables. In the code below we also support a case where we
3320   // don't have a single induction variable.
3321   //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
3324   //   - is an integer
3325   //   - counts from zero, stepping by one
3326   //   - is the size of the widest induction variable type
3327   // then we create a new one.
3328   OldInduction = Legal->getPrimaryInduction();
3329   Type *IdxTy = Legal->getWidestInductionType();
3330   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3331   // The loop step is equal to the vectorization factor (num of SIMD elements)
3332   // times the unroll factor (num of SIMD instructions).
3333   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3334   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
3335   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3336   Induction =
3337       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3338                               getDebugLocFromInstOrOperands(OldInduction));
3339 
3340   // Emit phis for the new starting index of the scalar loop.
3341   createInductionResumeValues(Lp, CountRoundDown);
3342 
3343   return completeLoopSkeleton(Lp, OrigLoopID);
3344 }
3345 
3346 // Fix up external users of the induction variable. At this point, we are
3347 // in LCSSA form, with all external PHIs that use the IV having one input value,
3348 // coming from the remainder loop. We need those PHIs to also have a correct
3349 // value for the IV when arriving directly from the middle block.
3350 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3351                                        const InductionDescriptor &II,
3352                                        Value *CountRoundDown, Value *EndValue,
3353                                        BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (users of the value that feeds into the
  // phi from the loop latch) and those that use the penultimate value (users
  // of the phi itself). We allow both, but they, obviously, have different
  // values.
3358 
3359   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3360 
3361   DenseMap<Value *, Value *> MissingVals;
3362 
3363   // An external user of the last iteration's value should see the value that
3364   // the remainder loop uses to initialize its own IV.
3365   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3366   for (User *U : PostInc->users()) {
3367     Instruction *UI = cast<Instruction>(U);
3368     if (!OrigLoop->contains(UI)) {
3369       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3370       MissingVals[UI] = EndValue;
3371     }
3372   }
3373 
  // An external user of the penultimate value needs to see EndValue - Step.
3375   // The simplest way to get this is to recompute it from the constituent SCEVs,
3376   // that is Start + (Step * (CRD - 1)).
3377   for (User *U : OrigPhi->users()) {
3378     auto *UI = cast<Instruction>(U);
3379     if (!OrigLoop->contains(UI)) {
3380       const DataLayout &DL =
3381           OrigLoop->getHeader()->getModule()->getDataLayout();
3382       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3383 
3384       IRBuilder<> B(MiddleBlock->getTerminator());
3385       Value *CountMinusOne = B.CreateSub(
3386           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3387       Value *CMO =
3388           !II.getStep()->getType()->isIntegerTy()
3389               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3390                              II.getStep()->getType())
3391               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3392       CMO->setName("cast.cmo");
3393       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3394       Escape->setName("ind.escape");
3395       MissingVals[UI] = Escape;
3396     }
3397   }
3398 
3399   for (auto &I : MissingVals) {
3400     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3402     // that is %IV2 = phi [...], [ %IV1, %latch ]
3403     // In this case, if IV1 has an external use, we need to avoid adding both
3404     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3405     // don't already have an incoming value for the middle block.
3406     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3407       PHI->addIncoming(I.second, MiddleBlock);
3408   }
3409 }
3410 
3411 namespace {
3412 
3413 struct CSEDenseMapInfo {
3414   static bool canHandle(const Instruction *I) {
3415     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3416            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3417   }
3418 
3419   static inline Instruction *getEmptyKey() {
3420     return DenseMapInfo<Instruction *>::getEmptyKey();
3421   }
3422 
3423   static inline Instruction *getTombstoneKey() {
3424     return DenseMapInfo<Instruction *>::getTombstoneKey();
3425   }
3426 
3427   static unsigned getHashValue(const Instruction *I) {
3428     assert(canHandle(I) && "Unknown instruction!");
3429     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3430                                                            I->value_op_end()));
3431   }
3432 
3433   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3434     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3435         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3436       return LHS == RHS;
3437     return LHS->isIdenticalTo(RHS);
3438   }
3439 };
3440 
3441 } // end anonymous namespace
3442 
/// Perform CSE of induction variable instructions.
3444 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3446   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3447   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3448     Instruction *In = &*I++;
3449 
3450     if (!CSEDenseMapInfo::canHandle(In))
3451       continue;
3452 
3453     // Check if we can replace this instruction with any of the
3454     // visited instructions.
3455     if (Instruction *V = CSEMap.lookup(In)) {
3456       In->replaceAllUsesWith(V);
3457       In->eraseFromParent();
3458       continue;
3459     }
3460 
3461     CSEMap[In] = In;
3462   }
3463 }
3464 
3465 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3466                                                        ElementCount VF,
3467                                                        bool &NeedToScalarize) {
3468   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3469   Function *F = CI->getCalledFunction();
3470   Type *ScalarRetTy = CI->getType();
3471   SmallVector<Type *, 4> Tys, ScalarTys;
3472   for (auto &ArgOp : CI->arg_operands())
3473     ScalarTys.push_back(ArgOp->getType());
3474 
3475   // Estimate cost of scalarized vector call. The source operands are assumed
3476   // to be vectors, so we need to extract individual elements from there,
3477   // execute VF scalar calls, and then gather the result into the vector return
3478   // value.
3479   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3480                                                  TTI::TCK_RecipThroughput);
3481   if (VF.isScalar())
3482     return ScalarCallCost;
3483 
3484   // Compute corresponding vector type for return value and arguments.
3485   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3486   for (Type *ScalarTy : ScalarTys)
3487     Tys.push_back(ToVectorTy(ScalarTy, VF));
3488 
3489   // Compute costs of unpacking argument values for the scalar calls and
3490   // packing the return values to a vector.
3491   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3492 
3493   unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
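  // Purely illustrative numbers: with VF = 4, a scalar call cost of 10 and a
  // scalarization overhead of 6, Cost = 4 * 10 + 6 = 46; if the target
  // provides a vector variant with cost 20, that is returned instead and
  // NeedToScalarize is cleared below.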
3494 
3495   // If we can't emit a vector call for this function, then the currently found
3496   // cost is the cost we need to return.
3497   NeedToScalarize = true;
3498   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3499   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3500 
3501   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3502     return Cost;
3503 
3504   // If the corresponding vector cost is cheaper, return its cost.
3505   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3506                                                  TTI::TCK_RecipThroughput);
3507   if (VectorCallCost < Cost) {
3508     NeedToScalarize = false;
3509     return VectorCallCost;
3510   }
3511   return Cost;
3512 }
3513 
3514 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3515                                                             ElementCount VF) {
3516   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3517   assert(ID && "Expected intrinsic call!");
3518 
3519   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3520   return TTI.getIntrinsicInstrCost(CostAttrs,
3521                                    TargetTransformInfo::TCK_RecipThroughput);
3522 }
3523 
3524 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3525   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3526   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3527   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3528 }
3529 
3530 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3531   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3532   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3533   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3534 }
3535 
3536 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3537   // For every instruction `I` in MinBWs, truncate the operands, create a
3538   // truncated version of `I` and reextend its result. InstCombine runs
3539   // later and will remove any ext/trunc pairs.
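  // For example (a sketch): if an i32 add was proven to need only 8 bits, it
  // becomes
  //   %a.tr = trunc <4 x i32> %a to <4 x i8>
  //   %b.tr = trunc <4 x i32> %b to <4 x i8>
  //   %add  = add <4 x i8> %a.tr, %b.tr
  //   %res  = zext <4 x i8> %add to <4 x i32>
  // with the trunc/zext pairs expected to be cleaned up by InstCombine.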
3540   SmallPtrSet<Value *, 4> Erased;
3541   for (const auto &KV : Cost->getMinimalBitwidths()) {
3542     // If the value wasn't vectorized, we must maintain the original scalar
3543     // type. The absence of the value from VectorLoopValueMap indicates that it
3544     // wasn't vectorized.
3545     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3546       continue;
3547     for (unsigned Part = 0; Part < UF; ++Part) {
3548       Value *I = getOrCreateVectorValue(KV.first, Part);
3549       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3550         continue;
3551       Type *OriginalTy = I->getType();
3552       Type *ScalarTruncatedTy =
3553           IntegerType::get(OriginalTy->getContext(), KV.second);
3554       auto *TruncatedTy = FixedVectorType::get(
3555           ScalarTruncatedTy,
3556           cast<FixedVectorType>(OriginalTy)->getNumElements());
3557       if (TruncatedTy == OriginalTy)
3558         continue;
3559 
3560       IRBuilder<> B(cast<Instruction>(I));
3561       auto ShrinkOperand = [&](Value *V) -> Value * {
3562         if (auto *ZI = dyn_cast<ZExtInst>(V))
3563           if (ZI->getSrcTy() == TruncatedTy)
3564             return ZI->getOperand(0);
3565         return B.CreateZExtOrTrunc(V, TruncatedTy);
3566       };
3567 
3568       // The actual instruction modification depends on the instruction type,
3569       // unfortunately.
3570       Value *NewI = nullptr;
3571       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3572         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3573                              ShrinkOperand(BO->getOperand(1)));
3574 
3575         // Any wrapping introduced by shrinking this operation shouldn't be
3576         // considered undefined behavior. So, we can't unconditionally copy
3577         // arithmetic wrapping flags to NewI.
3578         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3579       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3580         NewI =
3581             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3582                          ShrinkOperand(CI->getOperand(1)));
3583       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3584         NewI = B.CreateSelect(SI->getCondition(),
3585                               ShrinkOperand(SI->getTrueValue()),
3586                               ShrinkOperand(SI->getFalseValue()));
3587       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3588         switch (CI->getOpcode()) {
3589         default:
3590           llvm_unreachable("Unhandled cast!");
3591         case Instruction::Trunc:
3592           NewI = ShrinkOperand(CI->getOperand(0));
3593           break;
3594         case Instruction::SExt:
3595           NewI = B.CreateSExtOrTrunc(
3596               CI->getOperand(0),
3597               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3598           break;
3599         case Instruction::ZExt:
3600           NewI = B.CreateZExtOrTrunc(
3601               CI->getOperand(0),
3602               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3603           break;
3604         }
3605       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3606         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3607                              ->getNumElements();
3608         auto *O0 = B.CreateZExtOrTrunc(
3609             SI->getOperand(0),
3610             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3611         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3612                              ->getNumElements();
3613         auto *O1 = B.CreateZExtOrTrunc(
3614             SI->getOperand(1),
3615             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3616 
3617         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3618       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3619         // Don't do anything with the operands, just extend the result.
3620         continue;
3621       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3622         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3623                             ->getNumElements();
3624         auto *O0 = B.CreateZExtOrTrunc(
3625             IE->getOperand(0),
3626             FixedVectorType::get(ScalarTruncatedTy, Elements));
3627         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3628         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3629       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3630         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3631                             ->getNumElements();
3632         auto *O0 = B.CreateZExtOrTrunc(
3633             EE->getOperand(0),
3634             FixedVectorType::get(ScalarTruncatedTy, Elements));
3635         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3636       } else {
3637         // If we don't know what to do, be conservative and don't do anything.
3638         continue;
3639       }
3640 
3641       // Lastly, extend the result.
3642       NewI->takeName(cast<Instruction>(I));
3643       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3644       I->replaceAllUsesWith(Res);
3645       cast<Instruction>(I)->eraseFromParent();
3646       Erased.insert(I);
3647       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3648     }
3649   }
3650 
  // We'll have created a bunch of ZExts that are now dead (unused). Clean them up.
3652   for (const auto &KV : Cost->getMinimalBitwidths()) {
3653     // If the value wasn't vectorized, we must maintain the original scalar
3654     // type. The absence of the value from VectorLoopValueMap indicates that it
3655     // wasn't vectorized.
3656     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3657       continue;
3658     for (unsigned Part = 0; Part < UF; ++Part) {
3659       Value *I = getOrCreateVectorValue(KV.first, Part);
3660       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3661       if (Inst && Inst->use_empty()) {
3662         Value *NewI = Inst->getOperand(0);
3663         Inst->eraseFromParent();
3664         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3665       }
3666     }
3667   }
3668 }
3669 
3670 void InnerLoopVectorizer::fixVectorizedLoop() {
3671   // Insert truncates and extends for any truncated instructions as hints to
3672   // InstCombine.
3673   if (VF.isVector())
3674     truncateToMinimalBitwidths();
3675 
3676   // Fix widened non-induction PHIs by setting up the PHI operands.
3677   if (OrigPHIsToFix.size()) {
3678     assert(EnableVPlanNativePath &&
3679            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3680     fixNonInductionPHIs();
3681   }
3682 
3683   // At this point every instruction in the original loop is widened to a
3684   // vector form. Now we need to fix the recurrences in the loop. These PHI
3685   // nodes are currently empty because we did not want to introduce cycles.
3686   // This is the second stage of vectorizing recurrences.
3687   fixCrossIterationPHIs();
3688 
3689   // Forget the original basic block.
3690   PSE.getSE()->forgetLoop(OrigLoop);
3691 
3692   // Fix-up external users of the induction variables.
3693   for (auto &Entry : Legal->getInductionVars())
3694     fixupIVUsers(Entry.first, Entry.second,
3695                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3696                  IVEndValues[Entry.first], LoopMiddleBlock);
3697 
3698   fixLCSSAPHIs();
3699   for (Instruction *PI : PredicatedInstructions)
3700     sinkScalarOperands(&*PI);
3701 
3702   // Remove redundant induction instructions.
3703   cse(LoopVectorBody);
3704 
  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
3714   assert(!VF.isScalable() &&
3715          "cannot use scalable ElementCount to determine unroll factor");
3716   setProfileInfoAfterUnrolling(
3717       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3718       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3719 }
3720 
3721 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3722   // In order to support recurrences we need to be able to vectorize Phi nodes.
3723   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3724   // stage #2: We now need to fix the recurrences by adding incoming edges to
3725   // the currently empty PHI nodes. At this point every instruction in the
3726   // original loop is widened to a vector form so we can use them to construct
3727   // the incoming edges.
3728   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3729     // Handle first-order recurrences and reductions that need to be fixed.
3730     if (Legal->isFirstOrderRecurrence(&Phi))
3731       fixFirstOrderRecurrence(&Phi);
3732     else if (Legal->isReductionVariable(&Phi))
3733       fixReduction(&Phi);
3734   }
3735 }
3736 
3737 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3738   // This is the second phase of vectorizing first-order recurrences. An
3739   // overview of the transformation is described below. Suppose we have the
3740   // following loop.
3741   //
3742   //   for (int i = 0; i < n; ++i)
3743   //     b[i] = a[i] - a[i - 1];
3744   //
3745   // There is a first-order recurrence on "a". For this loop, the shorthand
3746   // scalar IR looks like:
3747   //
3748   //   scalar.ph:
3749   //     s_init = a[-1]
3750   //     br scalar.body
3751   //
3752   //   scalar.body:
3753   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3754   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3755   //     s2 = a[i]
3756   //     b[i] = s2 - s1
3757   //     br cond, scalar.body, ...
3758   //
  // In this example, s1 is a recurrence because its value depends on the
3760   // previous iteration. In the first phase of vectorization, we created a
3761   // temporary value for s1. We now complete the vectorization and produce the
3762   // shorthand vector IR shown below (for VF = 4, UF = 1).
3763   //
3764   //   vector.ph:
3765   //     v_init = vector(..., ..., ..., a[-1])
3766   //     br vector.body
3767   //
3768   //   vector.body
3769   //     i = phi [0, vector.ph], [i+4, vector.body]
3770   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3771   //     v2 = a[i, i+1, i+2, i+3];
3772   //     v3 = vector(v1(3), v2(0, 1, 2))
3773   //     b[i, i+1, i+2, i+3] = v2 - v3
3774   //     br cond, vector.body, middle.block
3775   //
3776   //   middle.block:
3777   //     x = v2(3)
3778   //     br scalar.ph
3779   //
3780   //   scalar.ph:
3781   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3782   //     br scalar.body
3783   //
  // After the vector loop finishes executing, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3786 
3787   // Get the original loop preheader and single loop latch.
3788   auto *Preheader = OrigLoop->getLoopPreheader();
3789   auto *Latch = OrigLoop->getLoopLatch();
3790 
3791   // Get the initial and previous values of the scalar recurrence.
3792   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3793   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3794 
3795   // Create a vector from the initial value.
3796   auto *VectorInit = ScalarInit;
3797   if (VF.isVector()) {
3798     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3799     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
3800     VectorInit = Builder.CreateInsertElement(
3801         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3802         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
3803   }
3804 
3805   // We constructed a temporary phi node in the first phase of vectorization.
3806   // This phi node will eventually be deleted.
3807   Builder.SetInsertPoint(
3808       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3809 
3810   // Create a phi node for the new recurrence. The current value will either be
3811   // the initial value inserted into a vector or loop-varying vector value.
3812   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3813   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3814 
3815   // Get the vectorized previous value of the last part UF - 1. It appears last
3816   // among all unrolled iterations, due to the order of their construction.
3817   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3818 
3819   // Find and set the insertion point after the previous value if it is an
3820   // instruction.
3821   BasicBlock::iterator InsertPt;
3822   // Note that the previous value may have been constant-folded so it is not
3823   // guaranteed to be an instruction in the vector loop.
3824   // FIXME: Loop invariant values do not form recurrences. We should deal with
3825   //        them earlier.
3826   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3827     InsertPt = LoopVectorBody->getFirstInsertionPt();
3828   else {
3829     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3830     if (isa<PHINode>(PreviousLastPart))
3831       // If the previous value is a phi node, we should insert after all the phi
3832       // nodes in the block containing the PHI to avoid breaking basic block
3833       // verification. Note that the basic block may be different to
3834       // LoopVectorBody, in case we predicate the loop.
3835       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3836     else
3837       InsertPt = ++PreviousInst->getIterator();
3838   }
3839   Builder.SetInsertPoint(&*InsertPt);
3840 
3841   // We will construct a vector for the recurrence by combining the values for
3842   // the current and previous iterations. This is the required shuffle mask.
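  // For example, with VF = 4 the mask is <3, 4, 5, 6>: lane 0 takes the last
  // element of the first shuffle operand and lanes 1-3 take the first three
  // elements of the second operand, matching the v3 construction in the
  // example above.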
3843   assert(!VF.isScalable());
3844   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
3845   ShuffleMask[0] = VF.getKnownMinValue() - 1;
3846   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
3847     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
3848 
3849   // The vector from which to take the initial value for the current iteration
3850   // (actual or unrolled). Initially, this is the vector phi node.
3851   Value *Incoming = VecPhi;
3852 
3853   // Shuffle the current and previous vector and update the vector parts.
3854   for (unsigned Part = 0; Part < UF; ++Part) {
3855     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3856     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3857     auto *Shuffle =
3858         VF.isVector()
3859             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
3860             : Incoming;
3861     PhiPart->replaceAllUsesWith(Shuffle);
3862     cast<Instruction>(PhiPart)->eraseFromParent();
3863     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3864     Incoming = PreviousPart;
3865   }
3866 
3867   // Fix the latch value of the new recurrence in the vector loop.
3868   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3869 
3870   // Extract the last vector element in the middle block. This will be the
3871   // initial value for the recurrence when jumping to the scalar loop.
3872   auto *ExtractForScalar = Incoming;
3873   if (VF.isVector()) {
3874     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3875     ExtractForScalar = Builder.CreateExtractElement(
3876         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
3877         "vector.recur.extract");
3878   }
  // Extract the second-to-last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
3884   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3885   if (VF.isVector())
3886     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3887         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
3888         "vector.recur.extract.for.phi");
  // When the loop is only unrolled (not vectorized), initialize
  // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
  // `Incoming`. This is analogous to the vectorized case above: extracting
  // the second-to-last element when VF > 1.
3893   else if (UF > 1)
3894     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3895 
3896   // Fix the initial value of the original recurrence in the scalar loop.
3897   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3898   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3899   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3900     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3901     Start->addIncoming(Incoming, BB);
3902   }
3903 
3904   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3905   Phi->setName("scalar.recur");
3906 
3907   // Finally, fix users of the recurrence outside the loop. The users will need
3908   // either the last value of the scalar recurrence or the last value of the
3909   // vector recurrence we extracted in the middle block. Since the loop is in
3910   // LCSSA form, we just need to find all the phi nodes for the original scalar
3911   // recurrence in the exit block, and then add an edge for the middle block.
3912   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3913     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3914       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3915     }
3916   }
3917 }
3918 
3919 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3920   Constant *Zero = Builder.getInt32(0);
3921 
  // Get its reduction variable descriptor.
3923   assert(Legal->isReductionVariable(Phi) &&
3924          "Unable to find the reduction variable");
3925   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3926 
3927   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3928   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3929   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3930   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3931     RdxDesc.getMinMaxRecurrenceKind();
3932   setDebugLocFromInst(Builder, ReductionStartValue);
3933   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3934 
3935   // We need to generate a reduction vector from the incoming scalar.
3936   // To do so, we need to generate the 'identity' vector and override
3937   // one of the elements with the incoming scalar reduction. We need
3938   // to do it in the vector-loop preheader.
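  // For example (illustrative), for an integer add reduction with VF = 4 and
  // start value %s, the identity vector is <0, 0, 0, 0> and the vector start
  // value is <%s, 0, 0, 0>, so only lane 0 carries the incoming scalar.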
3939   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3940 
3941   // This is the vector-clone of the value that leaves the loop.
3942   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3943 
  // Find the reduction identity value: zero for addition, or and xor; one for
  // multiplication; -1 (all ones) for and.
3946   Value *Identity;
3947   Value *VectorStart;
3948   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3949       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3951     if (VF.isScalar() || IsInLoopReductionPhi) {
3952       VectorStart = Identity = ReductionStartValue;
3953     } else {
3954       VectorStart = Identity =
3955         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3956     }
3957   } else {
3958     // Handle other reduction kinds:
3959     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3960         RK, MinMaxKind, VecTy->getScalarType());
3961     if (VF.isScalar() || IsInLoopReductionPhi) {
3962       Identity = Iden;
3963       // This vector is the Identity vector where the first element is the
3964       // incoming scalar reduction.
3965       VectorStart = ReductionStartValue;
3966     } else {
3967       Identity = ConstantVector::getSplat(VF, Iden);
3968 
3969       // This vector is the Identity vector where the first element is the
3970       // incoming scalar reduction.
3971       VectorStart =
3972         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3973     }
3974   }
3975 
3976   // Wrap flags are in general invalid after vectorization, clear them.
3977   clearReductionWrapFlags(RdxDesc);
3978 
3979   // Fix the vector-loop phi.
3980 
3981   // Reductions do not have to start at zero. They can start with
3982   // any loop invariant values.
3983   BasicBlock *Latch = OrigLoop->getLoopLatch();
3984   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3985 
3986   for (unsigned Part = 0; Part < UF; ++Part) {
3987     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3988     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3989     // Make sure to add the reduction start value only to the
3990     // first unroll part.
3991     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3992     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3993     cast<PHINode>(VecRdxPhi)
3994       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3995   }
3996 
3997   // Before each round, move the insertion point right between
3998   // the PHIs and the values we are going to write.
3999   // This allows us to write both PHINodes and the extractelement
4000   // instructions.
4001   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4002 
4003   setDebugLocFromInst(Builder, LoopExitInst);
4004 
  // If the tail is folded by masking, the vector value that leaves the loop
  // should be a select choosing between the vectorized LoopExitInst and the
  // vectorized Phi, rather than the vectorized LoopExitInst itself. For an
  // in-loop reduction the reduction will already be predicated and does not
  // need to be handled here.
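  // Conceptually (names are illustrative), the vectorized exit value is
  // already guarded in the loop by something like
  //
  //   %sel = select <VF x i1> %mask, <VF x ty> %rdx.next, <VF x ty> %rdx.phi
  //
  // and it is that select, rather than %rdx.next itself, that must feed the
  // final reduction in the middle block.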
4009   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4010     for (unsigned Part = 0; Part < UF; ++Part) {
4011       Value *VecLoopExitInst =
4012           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4013       Value *Sel = nullptr;
4014       for (User *U : VecLoopExitInst->users()) {
4015         if (isa<SelectInst>(U)) {
4016           assert(!Sel && "Reduction exit feeding two selects");
4017           Sel = U;
4018         } else
4019           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4020       }
4021       assert(Sel && "Reduction exit feeds no select");
4022       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4023 
4024       // If the target can create a predicated operator for the reduction at no
4025       // extra cost in the loop (for example a predicated vadd), it can be
4026       // cheaper for the select to remain in the loop than be sunk out of it,
4027       // and so use the select value for the phi instead of the old
4028       // LoopExitValue.
4029       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4030       if (PreferPredicatedReductionSelect ||
4031           TTI->preferPredicatedReductionSelect(
4032               RdxDesc.getRecurrenceBinOp(), Phi->getType(),
4033               TargetTransformInfo::ReductionFlags())) {
4034         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4035         VecRdxPhi->setIncomingValueForBlock(
4036             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4037       }
4038     }
4039   }
4040 
4041   // If the vector reduction can be performed in a smaller type, we truncate
4042   // then extend the loop exit value to enable InstCombine to evaluate the
4043   // entire expression in the smaller type.
4044   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4045     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4046     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4047     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4048     Builder.SetInsertPoint(
4049         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4050     VectorParts RdxParts(UF);
4051     for (unsigned Part = 0; Part < UF; ++Part) {
4052       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4053       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4054       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4055                                         : Builder.CreateZExt(Trunc, VecTy);
4056       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4057            UI != RdxParts[Part]->user_end();)
4058         if (*UI != Trunc) {
4059           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4060           RdxParts[Part] = Extnd;
4061         } else {
4062           ++UI;
4063         }
4064     }
4065     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4066     for (unsigned Part = 0; Part < UF; ++Part) {
4067       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4068       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4069     }
4070   }
4071 
4072   // Reduce all of the unrolled parts into a single vector.
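  // For example (illustrative), with UF = 2 and an add reduction, the two
  // unrolled parts are combined as
  //
  //   %bin.rdx = add <VF x ty> %rdx.part1, %rdx.part0
  //
  // before the across-lanes reduction is created below.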
4073   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4074   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4075 
4076   // The middle block terminator has already been assigned a DebugLoc here (the
4077   // OrigLoop's single latch terminator). We want the whole middle block to
4078   // appear to execute on this line because: (a) it is all compiler generated,
4079   // (b) these instructions are always executed after evaluating the latch
4080   // conditional branch, and (c) other passes may add new predecessors which
4081   // terminate on this line. This is the easiest way to ensure we don't
4082   // accidentally cause an extra step back into the loop while debugging.
4083   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4084   for (unsigned Part = 1; Part < UF; ++Part) {
4085     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4086     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4087       // Floating point operations had to be 'fast' to enable the reduction.
4088       ReducedPartRdx = addFastMathFlag(
4089           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4090                               ReducedPartRdx, "bin.rdx"),
4091           RdxDesc.getFastMathFlags());
4092     else
4093       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4094                                       RdxPart);
4095   }
4096 
  // Create the reduction after the loop. Note that in-loop reductions create
  // the target reduction inside the loop using a Reduction recipe.
4099   if (VF.isVector() && !IsInLoopReductionPhi) {
4100     bool NoNaN = Legal->hasFunNoNaNAttr();
4101     ReducedPartRdx =
4102         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4103     // If the reduction can be performed in a smaller type, we need to extend
4104     // the reduction to the wider type before we branch to the original loop.
4105     if (Phi->getType() != RdxDesc.getRecurrenceType())
4106       ReducedPartRdx =
4107         RdxDesc.isSigned()
4108         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4109         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4110   }
4111 
4112   // Create a phi node that merges control-flow from the backedge-taken check
4113   // block and the middle block.
4114   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4115                                         LoopScalarPreHeader->getTerminator());
4116   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4117     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4118   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4119 
4120   // Now, we need to fix the users of the reduction variable
4121   // inside and outside of the scalar remainder loop.
4122   // We know that the loop is in LCSSA form. We need to update the
4123   // PHI nodes in the exit blocks.
4124   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4125     // All PHINodes need to have a single entry edge, or two if
4126     // we already fixed them.
4127     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4128 
4129     // We found a reduction value exit-PHI. Update it with the
4130     // incoming bypass edge.
4131     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4132       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4133   } // end of the LCSSA phi scan.
4134 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
4137   int IncomingEdgeBlockIdx =
4138     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4139   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4140   // Pick the other block.
4141   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4142   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4143   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4144 }
4145 
4146 void InnerLoopVectorizer::clearReductionWrapFlags(
4147     RecurrenceDescriptor &RdxDesc) {
4148   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4149   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4150       RK != RecurrenceDescriptor::RK_IntegerMult)
4151     return;
4152 
4153   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4154   assert(LoopExitInstr && "null loop exit instruction");
4155   SmallVector<Instruction *, 8> Worklist;
4156   SmallPtrSet<Instruction *, 8> Visited;
4157   Worklist.push_back(LoopExitInstr);
4158   Visited.insert(LoopExitInstr);
4159 
4160   while (!Worklist.empty()) {
4161     Instruction *Cur = Worklist.pop_back_val();
4162     if (isa<OverflowingBinaryOperator>(Cur))
4163       for (unsigned Part = 0; Part < UF; ++Part) {
4164         Value *V = getOrCreateVectorValue(Cur, Part);
4165         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4166       }
4167 
4168     for (User *U : Cur->users()) {
4169       Instruction *UI = cast<Instruction>(U);
4170       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4171           Visited.insert(UI).second)
4172         Worklist.push_back(UI);
4173     }
4174   }
4175 }
4176 
4177 void InnerLoopVectorizer::fixLCSSAPHIs() {
4178   assert(!VF.isScalable() && "the code below assumes fixed width vectors");
4179   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4180     if (LCSSAPhi.getNumIncomingValues() == 1) {
4181       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4182       // Non-instruction incoming values will have only one value.
4183       unsigned LastLane = 0;
4184       if (isa<Instruction>(IncomingValue))
4185         LastLane = Cost->isUniformAfterVectorization(
4186                        cast<Instruction>(IncomingValue), VF)
4187                        ? 0
4188                        : VF.getKnownMinValue() - 1;
4189       // Can be a loop invariant incoming value or the last scalar value to be
4190       // extracted from the vectorized loop.
4191       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4192       Value *lastIncomingValue =
4193           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4194       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4195     }
4196   }
4197 }
4198 
4199 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4200   // The basic block and loop containing the predicated instruction.
4201   auto *PredBB = PredInst->getParent();
4202   auto *VectorLoop = LI->getLoopFor(PredBB);
4203 
4204   // Initialize a worklist with the operands of the predicated instruction.
4205   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4206 
4207   // Holds instructions that we need to analyze again. An instruction may be
4208   // reanalyzed if we don't yet know if we can sink it or not.
4209   SmallVector<Instruction *, 8> InstsToReanalyze;
4210 
4211   // Returns true if a given use occurs in the predicated block. Phi nodes use
4212   // their operands in their corresponding predecessor blocks.
4213   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4214     auto *I = cast<Instruction>(U.getUser());
4215     BasicBlock *BB = I->getParent();
4216     if (auto *Phi = dyn_cast<PHINode>(I))
4217       BB = Phi->getIncomingBlock(
4218           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4219     return BB == PredBB;
4220   };
4221 
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a full
  // pass over the worklist fails to sink any instruction.
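  // For example (illustrative), an address computation that is used only by a
  // store scalarized into a predicated block can be moved into that block so
  // it executes only when the predicate holds; sinking it may in turn allow
  // its own operands to be sunk as well.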
4226   bool Changed;
4227   do {
4228     // Add the instructions that need to be reanalyzed to the worklist, and
4229     // reset the changed indicator.
4230     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4231     InstsToReanalyze.clear();
4232     Changed = false;
4233 
4234     while (!Worklist.empty()) {
4235       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4236 
4237       // We can't sink an instruction if it is a phi node, is already in the
4238       // predicated block, is not in the loop, or may have side effects.
4239       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4240           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4241         continue;
4242 
4243       // It's legal to sink the instruction if all its uses occur in the
4244       // predicated block. Otherwise, there's nothing to do yet, and we may
4245       // need to reanalyze the instruction.
4246       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4247         InstsToReanalyze.push_back(I);
4248         continue;
4249       }
4250 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4253       I->moveBefore(&*PredBB->getFirstInsertionPt());
4254       Worklist.insert(I->op_begin(), I->op_end());
4255 
4256       // The sinking may have enabled other instructions to be sunk, so we will
4257       // need to iterate.
4258       Changed = true;
4259     }
4260   } while (Changed);
4261 }
4262 
4263 void InnerLoopVectorizer::fixNonInductionPHIs() {
4264   for (PHINode *OrigPhi : OrigPHIsToFix) {
4265     PHINode *NewPhi =
4266         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4267     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4268 
4269     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4270         predecessors(OrigPhi->getParent()));
4271     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4272         predecessors(NewPhi->getParent()));
4273     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4274            "Scalar and Vector BB should have the same number of predecessors");
4275 
4276     // The insertion point in Builder may be invalidated by the time we get
4277     // here. Force the Builder insertion point to something valid so that we do
4278     // not run into issues during insertion point restore in
4279     // getOrCreateVectorValue calls below.
4280     Builder.SetInsertPoint(NewPhi);
4281 
4282     // The predecessor order is preserved and we can rely on mapping between
4283     // scalar and vector block predecessors.
4284     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4285       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4286 
4287       // When looking up the new scalar/vector values to fix up, use incoming
4288       // values from original phi.
4289       Value *ScIncV =
4290           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4291 
      // The scalar incoming value may need a broadcast.
4293       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4294       NewPhi->addIncoming(NewIncV, NewPredBB);
4295     }
4296   }
4297 }
4298 
4299 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4300                                    VPUser &Operands, unsigned UF,
4301                                    ElementCount VF, bool IsPtrLoopInvariant,
4302                                    SmallBitVector &IsIndexLoopInvariant,
4303                                    VPTransformState &State) {
4304   // Construct a vector GEP by widening the operands of the scalar GEP as
4305   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4306   // results in a vector of pointers when at least one operand of the GEP
4307   // is vector-typed. Thus, to keep the representation compact, we only use
4308   // vector-typed operands for loop-varying values.
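  // For example (illustrative), a scalar GEP with a loop-invariant base and a
  // loop-varying index
  //
  //   %gep = getelementptr i32, i32* %base, i64 %idx
  //
  // is widened so that only the index becomes a vector operand:
  //
  //   %gep.vec = getelementptr i32, i32* %base, <VF x i64> %idx.vec
  //
  // yielding a vector of pointers while keeping the invariant operand scalar.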
4309 
4310   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4311     // If we are vectorizing, but the GEP has only loop-invariant operands,
4312     // the GEP we build (by only using vector-typed operands for
4313     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4314     // produce a vector of pointers, we need to either arbitrarily pick an
4315     // operand to broadcast, or broadcast a clone of the original GEP.
4316     // Here, we broadcast a clone of the original.
4317     //
4318     // TODO: If at some point we decide to scalarize instructions having
4319     //       loop-invariant operands, this special case will no longer be
4320     //       required. We would add the scalarization decision to
4321     //       collectLoopScalars() and teach getVectorValue() to broadcast
4322     //       the lane-zero scalar value.
4323     auto *Clone = Builder.Insert(GEP->clone());
4324     for (unsigned Part = 0; Part < UF; ++Part) {
4325       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4326       State.set(VPDef, GEP, EntryPart, Part);
4327       addMetadata(EntryPart, GEP);
4328     }
4329   } else {
4330     // If the GEP has at least one loop-varying operand, we are sure to
4331     // produce a vector of pointers. But if we are only unrolling, we want
4332     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4333     // produce with the code below will be scalar (if VF == 1) or vector
4334     // (otherwise). Note that for the unroll-only case, we still maintain
4335     // values in the vector mapping with initVector, as we do for other
4336     // instructions.
4337     for (unsigned Part = 0; Part < UF; ++Part) {
4338       // The pointer operand of the new GEP. If it's loop-invariant, we
4339       // won't broadcast it.
4340       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4341                                      : State.get(Operands.getOperand(0), Part);
4342 
4343       // Collect all the indices for the new GEP. If any index is
4344       // loop-invariant, we won't broadcast it.
4345       SmallVector<Value *, 4> Indices;
4346       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4347         VPValue *Operand = Operands.getOperand(I);
4348         if (IsIndexLoopInvariant[I - 1])
4349           Indices.push_back(State.get(Operand, {0, 0}));
4350         else
4351           Indices.push_back(State.get(Operand, Part));
4352       }
4353 
4354       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4355       // but it should be a vector, otherwise.
4356       auto *NewGEP =
4357           GEP->isInBounds()
4358               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4359                                           Indices)
4360               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4361       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4362              "NewGEP is not a pointer vector");
4363       State.set(VPDef, GEP, NewGEP, Part);
4364       addMetadata(NewGEP, GEP);
4365     }
4366   }
4367 }
4368 
4369 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4370                                               ElementCount VF) {
4371   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4372   PHINode *P = cast<PHINode>(PN);
4373   if (EnableVPlanNativePath) {
4374     // Currently we enter here in the VPlan-native path for non-induction
4375     // PHIs where all control flow is uniform. We simply widen these PHIs.
4376     // Create a vector phi with no operands - the vector phi operands will be
4377     // set at the end of vector code generation.
4378     Type *VecTy =
4379         (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4380     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4381     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4382     OrigPHIsToFix.push_back(P);
4383 
4384     return;
4385   }
4386 
4387   assert(PN->getParent() == OrigLoop->getHeader() &&
4388          "Non-header phis should have been handled elsewhere");
4389 
4390   // In order to support recurrences we need to be able to vectorize Phi nodes.
4391   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4392   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4393   // this value when we vectorize all of the instructions that use the PHI.
4394   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4395     for (unsigned Part = 0; Part < UF; ++Part) {
4396       // This is phase one of vectorizing PHIs.
4397       bool ScalarPHI =
4398           (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4399       Type *VecTy =
4400           ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4401       Value *EntryPart = PHINode::Create(
4402           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4403       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4404     }
4405     return;
4406   }
4407 
4408   setDebugLocFromInst(Builder, P);
4409 
4410   // This PHINode must be an induction variable.
4411   // Make sure that we know about it.
4412   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4413 
4414   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4415   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4416 
4417   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4418   // which can be found from the original scalar operations.
4419   switch (II.getKind()) {
4420   case InductionDescriptor::IK_NoInduction:
4421     llvm_unreachable("Unknown induction");
4422   case InductionDescriptor::IK_IntInduction:
4423   case InductionDescriptor::IK_FpInduction:
4424     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4425   case InductionDescriptor::IK_PtrInduction: {
4426     // Handle the pointer induction variable case.
4427     assert(P->getType()->isPointerTy() && "Unexpected type.");
4428 
4429     if (Cost->isScalarAfterVectorization(P, VF)) {
4430       // This is the normalized GEP that starts counting at zero.
4431       Value *PtrInd =
4432           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4433       // Determine the number of scalars we need to generate for each unroll
4434       // iteration. If the instruction is uniform, we only need to generate the
4435       // first lane. Otherwise, we generate all VF values.
4436       unsigned Lanes =
4437           Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4438       for (unsigned Part = 0; Part < UF; ++Part) {
4439         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4440           Constant *Idx = ConstantInt::get(PtrInd->getType(),
4441                                            Lane + Part * VF.getKnownMinValue());
4442           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4443           Value *SclrGep =
4444               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4445           SclrGep->setName("next.gep");
4446           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4447         }
4448       }
4449       return;
4450     }
4451     assert(isa<SCEVConstant>(II.getStep()) &&
4452            "Induction step not a SCEV constant!");
4453     Type *PhiType = II.getStep()->getType();
4454 
4455     // Build a pointer phi
4456     Value *ScalarStartValue = II.getStartValue();
4457     Type *ScStValueType = ScalarStartValue->getType();
4458     PHINode *NewPointerPhi =
4459         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4460     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4461 
4462     // A pointer induction, performed by using a gep
4463     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4464     Instruction *InductionLoc = LoopLatch->getTerminator();
4465     const SCEV *ScalarStep = II.getStep();
4466     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4467     Value *ScalarStepValue =
4468         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4469     Value *InductionGEP = GetElementPtrInst::Create(
4470         ScStValueType->getPointerElementType(), NewPointerPhi,
4471         Builder.CreateMul(
4472             ScalarStepValue,
4473             ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4474         "ptr.ind", InductionLoc);
4475     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4476 
4477     // Create UF many actual address geps that use the pointer
4478     // phi as base and a vectorized version of the step value
4479     // (<step*0, ..., step*N>) as offset.
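    // For example (illustrative), with VF = 4 and UF = 2, Part 1 uses the
    // constant offsets <4, 5, 6, 7>, which are then multiplied by a splat of
    // the step value to form the per-lane pointer offsets.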
4480     for (unsigned Part = 0; Part < UF; ++Part) {
4481       SmallVector<Constant *, 8> Indices;
      // Create a vector of consecutive offsets for this part:
      // Part * VF, ..., Part * VF + VF - 1.
4483       for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4484         Indices.push_back(
4485             ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4486       Constant *StartOffset = ConstantVector::get(Indices);
4487 
4488       Value *GEP = Builder.CreateGEP(
4489           ScStValueType->getPointerElementType(), NewPointerPhi,
4490           Builder.CreateMul(
4491               StartOffset,
4492               Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4493               "vector.gep"));
4494       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4495     }
4496   }
4497   }
4498 }
4499 
4500 /// A helper function for checking whether an integer division-related
4501 /// instruction may divide by zero (in which case it must be predicated if
4502 /// executed conditionally in the scalar code).
4503 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplications, so we will still end up scalarizing
/// the division, but can do so without predication.
4507 static bool mayDivideByZero(Instruction &I) {
4508   assert((I.getOpcode() == Instruction::UDiv ||
4509           I.getOpcode() == Instruction::SDiv ||
4510           I.getOpcode() == Instruction::URem ||
4511           I.getOpcode() == Instruction::SRem) &&
4512          "Unexpected instruction");
4513   Value *Divisor = I.getOperand(1);
4514   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4515   return !CInt || CInt->isZero();
4516 }
4517 
4518 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4519                                            VPUser &User,
4520                                            VPTransformState &State) {
4521   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4522   switch (I.getOpcode()) {
4523   case Instruction::Call:
4524   case Instruction::Br:
4525   case Instruction::PHI:
4526   case Instruction::GetElementPtr:
4527   case Instruction::Select:
4528     llvm_unreachable("This instruction is handled by a different recipe.");
4529   case Instruction::UDiv:
4530   case Instruction::SDiv:
4531   case Instruction::SRem:
4532   case Instruction::URem:
4533   case Instruction::Add:
4534   case Instruction::FAdd:
4535   case Instruction::Sub:
4536   case Instruction::FSub:
4537   case Instruction::FNeg:
4538   case Instruction::Mul:
4539   case Instruction::FMul:
4540   case Instruction::FDiv:
4541   case Instruction::FRem:
4542   case Instruction::Shl:
4543   case Instruction::LShr:
4544   case Instruction::AShr:
4545   case Instruction::And:
4546   case Instruction::Or:
4547   case Instruction::Xor: {
4548     // Just widen unops and binops.
4549     setDebugLocFromInst(Builder, &I);
4550 
4551     for (unsigned Part = 0; Part < UF; ++Part) {
4552       SmallVector<Value *, 2> Ops;
4553       for (VPValue *VPOp : User.operands())
4554         Ops.push_back(State.get(VPOp, Part));
4555 
4556       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4557 
4558       if (auto *VecOp = dyn_cast<Instruction>(V))
4559         VecOp->copyIRFlags(&I);
4560 
4561       // Use this vector value for all users of the original instruction.
4562       State.set(Def, &I, V, Part);
4563       addMetadata(V, &I);
4564     }
4565 
4566     break;
4567   }
4568   case Instruction::ICmp:
4569   case Instruction::FCmp: {
4570     // Widen compares. Generate vector compares.
4571     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4572     auto *Cmp = cast<CmpInst>(&I);
4573     setDebugLocFromInst(Builder, Cmp);
4574     for (unsigned Part = 0; Part < UF; ++Part) {
4575       Value *A = State.get(User.getOperand(0), Part);
4576       Value *B = State.get(User.getOperand(1), Part);
4577       Value *C = nullptr;
4578       if (FCmp) {
4579         // Propagate fast math flags.
4580         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4581         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4582         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4583       } else {
4584         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4585       }
4586       State.set(Def, &I, C, Part);
4587       addMetadata(C, &I);
4588     }
4589 
4590     break;
4591   }
4592 
4593   case Instruction::ZExt:
4594   case Instruction::SExt:
4595   case Instruction::FPToUI:
4596   case Instruction::FPToSI:
4597   case Instruction::FPExt:
4598   case Instruction::PtrToInt:
4599   case Instruction::IntToPtr:
4600   case Instruction::SIToFP:
4601   case Instruction::UIToFP:
4602   case Instruction::Trunc:
4603   case Instruction::FPTrunc:
4604   case Instruction::BitCast: {
4605     auto *CI = cast<CastInst>(&I);
4606     setDebugLocFromInst(Builder, CI);
4607 
    // Vectorize casts.
4609     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4610     Type *DestTy =
4611         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4612 
4613     for (unsigned Part = 0; Part < UF; ++Part) {
4614       Value *A = State.get(User.getOperand(0), Part);
4615       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4616       State.set(Def, &I, Cast, Part);
4617       addMetadata(Cast, &I);
4618     }
4619     break;
4620   }
4621   default:
4622     // This instruction is not vectorized by simple widening.
4623     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4624     llvm_unreachable("Unhandled instruction!");
4625   } // end of switch.
4626 }
4627 
4628 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4629                                                VPUser &ArgOperands,
4630                                                VPTransformState &State) {
4631   assert(!isa<DbgInfoIntrinsic>(I) &&
4632          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4633   setDebugLocFromInst(Builder, &I);
4634 
4635   Module *M = I.getParent()->getParent()->getParent();
4636   auto *CI = cast<CallInst>(&I);
4637 
4638   SmallVector<Type *, 4> Tys;
4639   for (Value *ArgOperand : CI->arg_operands())
4640     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4641 
4642   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4643 
  // The flag indicates whether we use an intrinsic or a plain call for the
  // vectorized version of the instruction, i.e., whether it is more
  // beneficial to perform an intrinsic call than a library call.
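  // For example (illustrative), a call to llvm.sqrt.f32 may be widened to the
  // llvm.sqrt.v4f32 intrinsic when its cost is no greater than the vector
  // call cost, or to a vector library routine found in the VFDatabase
  // otherwise.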
4647   bool NeedToScalarize = false;
4648   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4649   bool UseVectorIntrinsic =
4650       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4651   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4652          "Instruction should be scalarized elsewhere.");
4653 
4654   for (unsigned Part = 0; Part < UF; ++Part) {
4655     SmallVector<Value *, 4> Args;
4656     for (auto &I : enumerate(ArgOperands.operands())) {
4657       // Some intrinsics have a scalar argument - don't replace it with a
4658       // vector.
4659       Value *Arg;
4660       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4661         Arg = State.get(I.value(), Part);
4662       else
4663         Arg = State.get(I.value(), {0, 0});
4664       Args.push_back(Arg);
4665     }
4666 
4667     Function *VectorF;
4668     if (UseVectorIntrinsic) {
4669       // Use vector version of the intrinsic.
4670       Type *TysForDecl[] = {CI->getType()};
4671       if (VF.isVector()) {
4672         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4673         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4674       }
4675       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4676       assert(VectorF && "Can't retrieve vector intrinsic.");
4677     } else {
4678       // Use vector version of the function call.
4679       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4680 #ifndef NDEBUG
4681       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4682              "Can't create vector function.");
4683 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, &I, V, Part);
    addMetadata(V, &I);
4695   }
4696 }
4697 
4698 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
4699                                                  VPUser &Operands,
4700                                                  bool InvariantCond,
4701                                                  VPTransformState &State) {
4702   setDebugLocFromInst(Builder, &I);
4703 
  // The condition can be loop-invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // InstCombine will make this a no-op.
4708   auto *InvarCond =
4709       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4710 
4711   for (unsigned Part = 0; Part < UF; ++Part) {
4712     Value *Cond =
4713         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4714     Value *Op0 = State.get(Operands.getOperand(1), Part);
4715     Value *Op1 = State.get(Operands.getOperand(2), Part);
4716     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4717     State.set(VPDef, &I, Sel, Part);
4718     addMetadata(Sel, &I);
4719   }
4720 }
4721 
4722 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4723   // We should not collect Scalars more than once per VF. Right now, this
4724   // function is called from collectUniformsAndScalars(), which already does
4725   // this check. Collecting Scalars for VF=1 does not make any sense.
4726   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4727          "This function should not be visited twice for the same VF");
4728 
4729   SmallSetVector<Instruction *, 8> Worklist;
4730 
4731   // These sets are used to seed the analysis with pointers used by memory
4732   // accesses that will remain scalar.
4733   SmallSetVector<Instruction *, 8> ScalarPtrs;
4734   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4735   auto *Latch = TheLoop->getLoopLatch();
4736 
4737   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4738   // The pointer operands of loads and stores will be scalar as long as the
4739   // memory access is not a gather or scatter operation. The value operand of a
4740   // store will remain scalar if the store is scalarized.
4741   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4742     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4743     assert(WideningDecision != CM_Unknown &&
4744            "Widening decision should be ready at this moment");
4745     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4746       if (Ptr == Store->getValueOperand())
4747         return WideningDecision == CM_Scalarize;
4748     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4749            "Ptr is neither a value or pointer operand");
4750     return WideningDecision != CM_GatherScatter;
4751   };
4752 
4753   // A helper that returns true if the given value is a bitcast or
4754   // getelementptr instruction contained in the loop.
4755   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4756     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4757             isa<GetElementPtrInst>(V)) &&
4758            !TheLoop->isLoopInvariant(V);
4759   };
4760 
4761   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4762     if (!isa<PHINode>(Ptr) ||
4763         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4764       return false;
4765     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4766     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4767       return false;
4768     return isScalarUse(MemAccess, Ptr);
4769   };
4770 
4771   // A helper that evaluates a memory access's use of a pointer. If the
4772   // pointer is actually the pointer induction of a loop, it is being
4773   // inserted into Worklist. If the use will be a scalar use, and the
4774   // pointer is only used by memory accesses, we place the pointer in
4775   // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
4776   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4777     if (isScalarPtrInduction(MemAccess, Ptr)) {
4778       Worklist.insert(cast<Instruction>(Ptr));
4779       Instruction *Update = cast<Instruction>(
4780           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4781       Worklist.insert(Update);
4782       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4783                         << "\n");
4784       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4785                         << "\n");
4786       return;
4787     }
4788     // We only care about bitcast and getelementptr instructions contained in
4789     // the loop.
4790     if (!isLoopVaryingBitCastOrGEP(Ptr))
4791       return;
4792 
4793     // If the pointer has already been identified as scalar (e.g., if it was
4794     // also identified as uniform), there's nothing to do.
4795     auto *I = cast<Instruction>(Ptr);
4796     if (Worklist.count(I))
4797       return;
4798 
4799     // If the use of the pointer will be a scalar use, and all users of the
4800     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4801     // place the pointer in PossibleNonScalarPtrs.
4802     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4803           return isa<LoadInst>(U) || isa<StoreInst>(U);
4804         }))
4805       ScalarPtrs.insert(I);
4806     else
4807       PossibleNonScalarPtrs.insert(I);
4808   };
4809 
4810   // We seed the scalars analysis with two classes of instructions: (1)
4811   // instructions marked uniform-after-vectorization and (2) bitcast,
4812   // getelementptr and (pointer) phi instructions used by memory accesses
4813   // requiring a scalar use.
4814   //
4815   // (1) Add to the worklist all instructions that have been identified as
4816   // uniform-after-vectorization.
4817   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4818 
4819   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4820   // memory accesses requiring a scalar use. The pointer operands of loads and
4821   // stores will be scalar as long as the memory accesses is not a gather or
4822   // stores will be scalar as long as the memory access is not a gather or
4823   // store is scalarized.
4824   for (auto *BB : TheLoop->blocks())
4825     for (auto &I : *BB) {
4826       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4827         evaluatePtrUse(Load, Load->getPointerOperand());
4828       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4829         evaluatePtrUse(Store, Store->getPointerOperand());
4830         evaluatePtrUse(Store, Store->getValueOperand());
4831       }
4832     }
4833   for (auto *I : ScalarPtrs)
4834     if (!PossibleNonScalarPtrs.count(I)) {
4835       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4836       Worklist.insert(I);
4837     }
4838 
4839   // Insert the forced scalars.
4840   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4841   // induction variable when the PHI user is scalarized.
4842   auto ForcedScalar = ForcedScalars.find(VF);
4843   if (ForcedScalar != ForcedScalars.end())
4844     for (auto *I : ForcedScalar->second)
4845       Worklist.insert(I);
4846 
4847   // Expand the worklist by looking through any bitcasts and getelementptr
4848   // instructions we've already identified as scalar. This is similar to the
4849   // expansion step in collectLoopUniforms(); however, here we're only
4850   // expanding to include additional bitcasts and getelementptr instructions.
4851   unsigned Idx = 0;
4852   while (Idx != Worklist.size()) {
4853     Instruction *Dst = Worklist[Idx++];
4854     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4855       continue;
4856     auto *Src = cast<Instruction>(Dst->getOperand(0));
4857     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4858           auto *J = cast<Instruction>(U);
4859           return !TheLoop->contains(J) || Worklist.count(J) ||
4860                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4861                   isScalarUse(J, Src));
4862         })) {
4863       Worklist.insert(Src);
4864       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4865     }
4866   }
4867 
4868   // An induction variable will remain scalar if all users of the induction
4869   // variable and induction variable update remain scalar.
4870   for (auto &Induction : Legal->getInductionVars()) {
4871     auto *Ind = Induction.first;
4872     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4873 
4874     // If tail-folding is applied, the primary induction variable will be used
4875     // to feed a vector compare.
4876     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4877       continue;
4878 
4879     // Determine if all users of the induction variable are scalar after
4880     // vectorization.
4881     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4882       auto *I = cast<Instruction>(U);
4883       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4884     });
4885     if (!ScalarInd)
4886       continue;
4887 
4888     // Determine if all users of the induction variable update instruction are
4889     // scalar after vectorization.
4890     auto ScalarIndUpdate =
4891         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4892           auto *I = cast<Instruction>(U);
4893           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4894         });
4895     if (!ScalarIndUpdate)
4896       continue;
4897 
4898     // The induction variable and its update instruction will remain scalar.
4899     Worklist.insert(Ind);
4900     Worklist.insert(IndUpdate);
4901     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4902     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4903                       << "\n");
4904   }
4905 
4906   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4907 }
4908 
4909 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
4910                                                          ElementCount VF) {
4911   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4912   if (!blockNeedsPredication(I->getParent()))
4913     return false;
4914   switch (I->getOpcode()) {
4915   default:
4916     break;
4917   case Instruction::Load:
4918   case Instruction::Store: {
4919     if (!Legal->isMaskRequired(I))
4920       return false;
4921     auto *Ptr = getLoadStorePointerOperand(I);
4922     auto *Ty = getMemInstValueType(I);
4923     // We have already decided how to vectorize this instruction, get that
4924     // result.
4925     if (VF.isVector()) {
4926       InstWidening WideningDecision = getWideningDecision(I, VF);
4927       assert(WideningDecision != CM_Unknown &&
4928              "Widening decision should be ready at this moment");
4929       return WideningDecision == CM_Scalarize;
4930     }
4931     const Align Alignment = getLoadStoreAlignment(I);
4932     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4933                                 isLegalMaskedGather(Ty, Alignment))
4934                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4935                                 isLegalMaskedScatter(Ty, Alignment));
4936   }
4937   case Instruction::UDiv:
4938   case Instruction::SDiv:
4939   case Instruction::SRem:
4940   case Instruction::URem:
4941     return mayDivideByZero(*I);
4942   }
4943   return false;
4944 }
4945 
4946 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4947     Instruction *I, ElementCount VF) {
4948   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4949   assert(getWideningDecision(I, VF) == CM_Unknown &&
4950          "Decision should not be set yet.");
4951   auto *Group = getInterleavedAccessGroup(I);
4952   assert(Group && "Must have a group.");
4953 
4954   // If the instruction's allocated size doesn't equal its type size, it
4955   // requires padding and will be scalarized.
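  // (For example, a type such as x86_fp80, whose allocated size exceeds its
  // 80-bit type size, would typically be irregular in this sense.)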
4956   auto &DL = I->getModule()->getDataLayout();
4957   auto *ScalarTy = getMemInstValueType(I);
4958   if (hasIrregularType(ScalarTy, DL, VF))
4959     return false;
4960 
4961   // Check if masking is required.
4962   // A Group may need masking for one of two reasons: it resides in a block that
4963   // needs predication, or it was decided to use masking to deal with gaps.
4964   bool PredicatedAccessRequiresMasking =
4965       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4966   bool AccessWithGapsRequiresMasking =
4967       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4968   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4969     return true;
4970 
4971   // If masked interleaving is required, we expect that the user/target had
4972   // enabled it, because otherwise it either wouldn't have been created or
4973   // it should have been invalidated by the CostModel.
4974   assert(useMaskedInterleavedAccesses(TTI) &&
4975          "Masked interleave-groups for predicated accesses are not enabled.");
4976 
4977   auto *Ty = getMemInstValueType(I);
4978   const Align Alignment = getLoadStoreAlignment(I);
4979   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4980                           : TTI.isLegalMaskedStore(Ty, Alignment);
4981 }
4982 
4983 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4984     Instruction *I, ElementCount VF) {
4985   // Get and ensure we have a valid memory instruction.
4986   LoadInst *LI = dyn_cast<LoadInst>(I);
4987   StoreInst *SI = dyn_cast<StoreInst>(I);
4988   assert((LI || SI) && "Invalid memory instruction");
4989 
4990   auto *Ptr = getLoadStorePointerOperand(I);
4991 
4992   // In order to be widened, the pointer should be consecutive, first of all.
4993   if (!Legal->isConsecutivePtr(Ptr))
4994     return false;
4995 
4996   // If the instruction is a store located in a predicated block, it will be
4997   // scalarized.
4998   if (isScalarWithPredication(I))
4999     return false;
5000 
5001   // If the instruction's allocated size doesn't equal its type size, it
5002   // requires padding and will be scalarized.
5003   auto &DL = I->getModule()->getDataLayout();
5004   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5005   if (hasIrregularType(ScalarTy, DL, VF))
5006     return false;
5007 
5008   return true;
5009 }
5010 
5011 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5012   // We should not collect Uniforms more than once per VF. Right now,
5013   // this function is called from collectUniformsAndScalars(), which
5014   // already does this check. Collecting Uniforms for VF=1 does not make any
5015   // sense.
5016 
5017   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5018          "This function should not be visited twice for the same VF");
5019 
5020   // Create the entry for this VF even if we do not find any uniform value, so
5021   // that we will not analyze it again: Uniforms.count(VF) will then return 1.
5022   Uniforms[VF].clear();
5023 
5024   // We now know that the loop is vectorizable!
5025   // Collect instructions inside the loop that will remain uniform after
5026   // vectorization.
5027 
5028   // Global values, params and instructions outside of current loop are out of
5029   // scope.
5030   auto isOutOfScope = [&](Value *V) -> bool {
5031     Instruction *I = dyn_cast<Instruction>(V);
5032     return (!I || !TheLoop->contains(I));
5033   };
5034 
5035   SetVector<Instruction *> Worklist;
5036   BasicBlock *Latch = TheLoop->getLoopLatch();
5037 
5038   // Instructions that are scalar with predication must not be considered
5039   // uniform after vectorization, because that would create an erroneous
5040   // replicating region where only a single instance out of VF should be formed.
5041   // TODO: optimize such seldom cases if found important, see PR40816.
5042   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5043     if (isOutOfScope(I)) {
5044       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5045                         << *I << "\n");
5046       return;
5047     }
5048     if (isScalarWithPredication(I, VF)) {
5049       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5050                         << *I << "\n");
5051       return;
5052     }
5053     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5054     Worklist.insert(I);
5055   };
5056 
5057   // Start with the conditional branch. If the branch condition is an
5058   // instruction contained in the loop that is only used by the branch, it is
5059   // uniform.
5060   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5061   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5062     addToWorklistIfAllowed(Cmp);
5063 
5064   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
5065   // are pointers that are treated like consecutive pointers during
5066   // vectorization. The pointer operands of interleaved accesses are an
5067   // example.
5068   SmallSetVector<Value *, 8> ConsecutiveLikePtrs;
5069 
5070   // Holds pointer operands of instructions that are possibly non-uniform.
5071   SmallPtrSet<Value *, 8> PossibleNonUniformPtrs;
5072 
5073   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5074     InstWidening WideningDecision = getWideningDecision(I, VF);
5075     assert(WideningDecision != CM_Unknown &&
5076            "Widening decision should be ready at this moment");
5077 
5078     // The address of a uniform mem op is itself uniform.  We exclude stores
5079     // here as there's an assumption in the current code that all uses of
5080     // uniform instructions are uniform and, as noted below, uniform stores are
5081     // still handled via replication (i.e. aren't uniform after vectorization).
5082     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5083       assert(WideningDecision == CM_Scalarize);
5084       return true;
5085     }
5086 
5087     return (WideningDecision == CM_Widen ||
5088             WideningDecision == CM_Widen_Reverse ||
5089             WideningDecision == CM_Interleave);
5090   };
5091 
5092 
5093   // Returns true if Ptr is the pointer operand of a memory access instruction
5094   // I, and I is known to not require scalarization.
5095   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5096     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5097   };
5098 
5099   // Iterate over the instructions in the loop, and collect all
5100   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
5101   // that a consecutive-like pointer operand will be scalarized, we collect it
5102   // in PossibleNonUniformPtrs instead. We use two sets here because a single
5103   // getelementptr instruction can be used by both vectorized and scalarized
5104   // memory instructions. For example, if a loop loads and stores from the same
5105   // location, but the store is conditional, the store will be scalarized, and
5106   // the getelementptr won't remain uniform.
5107   for (auto *BB : TheLoop->blocks())
5108     for (auto &I : *BB) {
5109       // If there's no pointer operand, there's nothing to do.
5110       auto *Ptr = getLoadStorePointerOperand(&I);
5111       if (!Ptr)
5112         continue;
5113 
5114       // For now, avoid walking use lists in other functions.
5115       // TODO: Rewrite this algorithm from uses up.
5116       if (!isa<Instruction>(Ptr) && !isa<Argument>(Ptr))
5117         continue;
5118 
5119       // A uniform memory op is itself uniform.  We exclude stores here as we
5120       // haven't yet added dedicated logic in the CLONE path and rely on
5121       // REPLICATE + DSE for correctness.
5122       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5123         addToWorklistIfAllowed(&I);
5124 
5125       // True if all users of Ptr are memory accesses that have Ptr as their
5126       // pointer operand.  Since loops are assumed to be in LCSSA form, this
5127       // disallows uses outside the loop as well.
5128       auto UsersAreMemAccesses =
5129           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
5130             return getLoadStorePointerOperand(U) == Ptr;
5131           });
5132 
5133       // Ensure the memory instruction will not be scalarized or used by
5134       // gather/scatter, making its pointer operand non-uniform. If the pointer
5135       // operand is used by any instruction other than a memory access, we
5136       // conservatively assume the pointer operand may be non-uniform.
5137       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
5138         PossibleNonUniformPtrs.insert(Ptr);
5139 
5140       // If the memory instruction will be vectorized and its pointer operand
5141       // is consecutive-like, or interleaving - the pointer operand should
5142       // remain uniform.
5143       else
5144         ConsecutiveLikePtrs.insert(Ptr);
5145     }
5146 
5147   // Add to the Worklist all consecutive and consecutive-like pointers that
5148   // aren't also identified as possibly non-uniform.
5149   for (auto *V : ConsecutiveLikePtrs)
5150     if (!PossibleNonUniformPtrs.count(V))
5151       if (auto *I = dyn_cast<Instruction>(V))
5152         addToWorklistIfAllowed(I);
5153 
5154   // Expand Worklist in topological order: whenever a new instruction
5155   // is added, its users should already be inside Worklist. This ensures
5156   // that a uniform instruction will only be used by uniform instructions.
5157   unsigned idx = 0;
5158   while (idx != Worklist.size()) {
5159     Instruction *I = Worklist[idx++];
5160 
5161     for (auto OV : I->operand_values()) {
5162       // isOutOfScope operands cannot be uniform instructions.
5163       if (isOutOfScope(OV))
5164         continue;
5165       // First-order recurrence phis should typically be considered
5166       // non-uniform.
5167       auto *OP = dyn_cast<PHINode>(OV);
5168       if (OP && Legal->isFirstOrderRecurrence(OP))
5169         continue;
5170       // If all the users of the operand are uniform, then add the
5171       // operand into the uniform worklist.
5172       auto *OI = cast<Instruction>(OV);
5173       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5174             auto *J = cast<Instruction>(U);
5175             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5176           }))
5177         addToWorklistIfAllowed(OI);
5178     }
5179   }
5180 
5181   // For an instruction to be added into Worklist above, all its users inside
5182   // the loop should also be in Worklist. However, this condition cannot be
5183   // true for phi nodes that form a cyclic dependence. We must process phi
5184   // nodes separately. An induction variable will remain uniform if all users
5185   // of the induction variable and induction variable update remain uniform.
5186   // The code below handles both pointer and non-pointer induction variables.
5187   for (auto &Induction : Legal->getInductionVars()) {
5188     auto *Ind = Induction.first;
5189     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5190 
5191     // Determine if all users of the induction variable are uniform after
5192     // vectorization.
5193     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5194       auto *I = cast<Instruction>(U);
5195       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5196              isVectorizedMemAccessUse(I, Ind);
5197     });
5198     if (!UniformInd)
5199       continue;
5200 
5201     // Determine if all users of the induction variable update instruction are
5202     // uniform after vectorization.
5203     auto UniformIndUpdate =
5204         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5205           auto *I = cast<Instruction>(U);
5206           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5207                  isVectorizedMemAccessUse(I, IndUpdate);
5208         });
5209     if (!UniformIndUpdate)
5210       continue;
5211 
5212     // The induction variable and its update instruction will remain uniform.
5213     addToWorklistIfAllowed(Ind);
5214     addToWorklistIfAllowed(IndUpdate);
5215   }
5216 
5217   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5218 }
5219 
5220 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5221   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5222 
5223   if (Legal->getRuntimePointerChecking()->Need) {
5224     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5225         "runtime pointer checks needed. Enable vectorization of this "
5226         "loop with '#pragma clang loop vectorize(enable)' when "
5227         "compiling with -Os/-Oz",
5228         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5229     return true;
5230   }
5231 
5232   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5233     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5234         "runtime SCEV checks needed. Enable vectorization of this "
5235         "loop with '#pragma clang loop vectorize(enable)' when "
5236         "compiling with -Os/-Oz",
5237         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5238     return true;
5239   }
5240 
5241   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5242   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5243     reportVectorizationFailure("Runtime stride check for small trip count",
5244         "runtime stride == 1 checks needed. Enable vectorization of "
5245         "this loop without such check by compiling with -Os/-Oz",
5246         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5247     return true;
5248   }
5249 
5250   return false;
5251 }
5252 
5253 Optional<ElementCount>
5254 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5255   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5256     // TODO: It may be useful to do this, since the check is still likely to
5257     // be dynamically uniform if the target can skip it.
5258     reportVectorizationFailure(
5259         "Not inserting runtime ptr check for divergent target",
5260         "runtime pointer checks needed. Not enabled for divergent target",
5261         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5262     return None;
5263   }
5264 
5265   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5266   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5267   if (TC == 1) {
5268     reportVectorizationFailure("Single iteration (non) loop",
5269         "loop trip count is one, irrelevant for vectorization",
5270         "SingleIterationLoop", ORE, TheLoop);
5271     return None;
5272   }
5273 
5274   ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5275 
5276   switch (ScalarEpilogueStatus) {
5277   case CM_ScalarEpilogueAllowed:
5278     return MaxVF;
5279   case CM_ScalarEpilogueNotNeededUsePredicate:
5280     LLVM_DEBUG(
5281         dbgs() << "LV: vector predicate hint/switch found.\n"
5282                << "LV: Not allowing scalar epilogue, creating predicated "
5283                << "vector loop.\n");
5284     break;
5285   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5286     // fallthrough as a special case of OptForSize
5287   case CM_ScalarEpilogueNotAllowedOptSize:
5288     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5289       LLVM_DEBUG(
5290           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5291     else
5292       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5293                         << "count.\n");
5294 
5295     // Bail if runtime checks are required, which are not good when optimising
5296     // for size.
5297     if (runtimeChecksRequired())
5298       return None;
5299     break;
5300   }
5301 
5302   // Now try the tail folding
5303 
5304   // Invalidate interleave groups that require an epilogue if we can't mask
5305   // the interleave-group.
5306   if (!useMaskedInterleavedAccesses(TTI)) {
5307     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5308            "No decisions should have been taken at this point");
5309     // Note: There is no need to invalidate any cost modeling decisions here, as
5310     // none were taken so far.
5311     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5312   }
5313 
5314   assert(!MaxVF.isScalable() &&
5315          "Scalable vectors do not yet support tail folding");
5316   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5317          "MaxVF must be a power of 2");
5318   unsigned MaxVFtimesIC =
5319       UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
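  // Illustrative example (hypothetical values): with TC = 64, MaxVF = 8 and
  // UserIC = 2, MaxVFtimesIC is 16 and 64 % 16 == 0, so no tail remains.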
5320   if (TC > 0 && TC % MaxVFtimesIC == 0) {
5321     // Accept MaxVF if we do not have a tail.
5322     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5323     return MaxVF;
5324   }
5325 
5326   // If we don't know the precise trip count, or if the trip count that we
5327   // found modulo the vectorization factor is not zero, try to fold the tail
5328   // by masking.
5329   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5330   if (Legal->prepareToFoldTailByMasking()) {
5331     FoldTailByMasking = true;
5332     return MaxVF;
5333   }
5334 
5335   // If there was a tail-folding hint/switch, but we can't fold the tail by
5336   // masking, fallback to a vectorization with a scalar epilogue.
5337   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5338     if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) {
5339       LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5340       return None;
5341     }
5342     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5343                          "scalar epilogue instead.\n");
5344     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5345     return MaxVF;
5346   }
5347 
5348   if (TC == 0) {
5349     reportVectorizationFailure(
5350         "Unable to calculate the loop count due to complex control flow",
5351         "unable to calculate the loop count due to complex control flow",
5352         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5353     return None;
5354   }
5355 
5356   reportVectorizationFailure(
5357       "Cannot optimize for size and vectorize at the same time.",
5358       "cannot optimize for size and vectorize at the same time. "
5359       "Enable vectorization of this loop with '#pragma clang loop "
5360       "vectorize(enable)' when compiling with -Os/-Oz",
5361       "NoTailLoopWithOptForSize", ORE, TheLoop);
5362   return None;
5363 }
5364 
5365 ElementCount
5366 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5367                                                  ElementCount UserVF) {
5368   assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
5369   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5370   unsigned SmallestType, WidestType;
5371   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5372   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5373 
5374   // Get the maximum safe dependence distance in bits computed by LAA.
5375   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5376   // the memory access that is most restrictive (involved in the smallest
5377   // dependence distance).
5378   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
5379 
5380   if (UserVF.isNonZero()) {
5381     // If legally unsafe, clamp the user vectorization factor to a safe value.
5382     unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
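    // Illustrative example (hypothetical values): a max safe width of 256 bits
    // and a widest type of 32 bits give MaxSafeVF = PowerOf2Floor(256 / 32) = 8,
    // so a requested UserVF of 16 would be clamped to 8.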
5383     if (UserVF.getFixedValue() <= MaxSafeVF)
5384       return UserVF;
5385 
5386     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5387                       << " is unsafe, clamping to max safe VF=" << MaxSafeVF
5388                       << ".\n");
5389     ORE->emit([&]() {
5390       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5391                                         TheLoop->getStartLoc(),
5392                                         TheLoop->getHeader())
5393              << "User-specified vectorization factor "
5394              << ore::NV("UserVectorizationFactor", UserVF)
5395              << " is unsafe, clamping to maximum safe vectorization factor "
5396              << ore::NV("VectorizationFactor", MaxSafeVF);
5397     });
5398     return ElementCount::getFixed(MaxSafeVF);
5399   }
5400 
5401   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
5402 
5403   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5404   // Note that both WidestRegister and WidestType may not be powers of 2.
5405   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
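  // Illustrative example (hypothetical values): a 256-bit widest register and
  // a 24-bit widest type give PowerOf2Floor(256 / 24) = PowerOf2Floor(10) = 8
  // elements.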
5406 
5407   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5408                     << " / " << WidestType << " bits.\n");
5409   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5410                     << WidestRegister << " bits.\n");
5411 
5412   assert(MaxVectorSize <= WidestRegister &&
5413          "Did not expect to pack so many elements"
5414          " into one vector!");
5415   if (MaxVectorSize == 0) {
5416     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5417     MaxVectorSize = 1;
5418     return ElementCount::getFixed(MaxVectorSize);
5419   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5420              isPowerOf2_32(ConstTripCount)) {
5421     // We need to clamp the VF to be the ConstTripCount. There is no point in
5422     // choosing a higher viable VF as done in the loop below.
5423     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5424                       << ConstTripCount << "\n");
5425     MaxVectorSize = ConstTripCount;
5426     return ElementCount::getFixed(MaxVectorSize);
5427   }
5428 
5429   unsigned MaxVF = MaxVectorSize;
5430   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5431       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5432     // Collect all viable vectorization factors larger than the default MaxVF
5433     // (i.e. MaxVectorSize).
5434     SmallVector<ElementCount, 8> VFs;
5435     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5436     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5437       VFs.push_back(ElementCount::getFixed(VS));
5438 
5439     // For each VF calculate its register usage.
5440     auto RUs = calculateRegisterUsage(VFs);
5441 
5442     // Select the largest VF which doesn't require more registers than existing
5443     // ones.
5444     for (int i = RUs.size() - 1; i >= 0; --i) {
5445       bool Selected = true;
5446       for (auto& pair : RUs[i].MaxLocalUsers) {
5447         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5448         if (pair.second > TargetNumRegisters)
5449           Selected = false;
5450       }
5451       if (Selected) {
5452         MaxVF = VFs[i].getKnownMinValue();
5453         break;
5454       }
5455     }
5456     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5457       if (MaxVF < MinVF) {
5458         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5459                           << ") with target's minimum: " << MinVF << '\n');
5460         MaxVF = MinVF;
5461       }
5462     }
5463   }
5464   return ElementCount::getFixed(MaxVF);
5465 }
5466 
5467 VectorizationFactor
5468 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
5469   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
5470 
5471   float Cost = expectedCost(ElementCount::getFixed(1)).first;
5472   const float ScalarCost = Cost;
5473   unsigned Width = 1;
5474   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5475 
5476   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5477   if (ForceVectorization && MaxVF.isVector()) {
5478     // Ignore scalar width, because the user explicitly wants vectorization.
5479     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5480     // evaluation.
5481     Cost = std::numeric_limits<float>::max();
5482   }
5483 
5484   for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
5485     // Notice that the vector loop needs to be executed fewer times, so
5486     // we need to divide the cost of the vector loop by the width of
5487     // the vector elements.
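    // Illustrative example (hypothetical costs): a scalar loop cost of 8 and a
    // VF=4 vector loop cost of 20 give a per-lane cost of 20 / 4 = 5, which
    // beats the scalar cost of 8.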
5488     VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5489     float VectorCost = C.first / (float)i;
5490     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5491                       << " costs: " << (int)VectorCost << ".\n");
5492     if (!C.second && !ForceVectorization) {
5493       LLVM_DEBUG(
5494           dbgs() << "LV: Not considering vector loop of width " << i
5495                  << " because it will not generate any vector instructions.\n");
5496       continue;
5497     }
5498     if (VectorCost < Cost) {
5499       Cost = VectorCost;
5500       Width = i;
5501     }
5502   }
5503 
5504   if (!EnableCondStoresVectorization && NumPredStores) {
5505     reportVectorizationFailure("There are conditional stores.",
5506         "store that is conditionally executed prevents vectorization",
5507         "ConditionalStore", ORE, TheLoop);
5508     Width = 1;
5509     Cost = ScalarCost;
5510   }
5511 
5512   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5513              << "LV: Vectorization seems to be not beneficial, "
5514              << "but was forced by a user.\n");
5515   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5516   VectorizationFactor Factor = {ElementCount::getFixed(Width),
5517                                 (unsigned)(Width * Cost)};
5518   return Factor;
5519 }
5520 
5521 std::pair<unsigned, unsigned>
5522 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
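  // Illustrative example (hypothetical loop): if the only memory traffic is
  // i64 loads and i16 stores, this returns {16, 64}.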
5523   unsigned MinWidth = -1U;
5524   unsigned MaxWidth = 8;
5525   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5526 
5527   // For each block.
5528   for (BasicBlock *BB : TheLoop->blocks()) {
5529     // For each instruction in the loop.
5530     for (Instruction &I : BB->instructionsWithoutDebug()) {
5531       Type *T = I.getType();
5532 
5533       // Skip ignored values.
5534       if (ValuesToIgnore.count(&I))
5535         continue;
5536 
5537       // Only examine Loads, Stores and PHINodes.
5538       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5539         continue;
5540 
5541       // Examine PHI nodes that are reduction variables. Update the type to
5542       // account for the recurrence type.
5543       if (auto *PN = dyn_cast<PHINode>(&I)) {
5544         if (!Legal->isReductionVariable(PN))
5545           continue;
5546         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5547         T = RdxDesc.getRecurrenceType();
5548       }
5549 
5550       // Examine the stored values.
5551       if (auto *ST = dyn_cast<StoreInst>(&I))
5552         T = ST->getValueOperand()->getType();
5553 
5554       // Ignore loaded pointer types and stored pointer types that are not
5555       // vectorizable.
5556       //
5557       // FIXME: The check here attempts to predict whether a load or store will
5558       //        be vectorized. We only know this for certain after a VF has
5559       //        been selected. Here, we assume that if an access can be
5560       //        vectorized, it will be. We should also look at extending this
5561       //        optimization to non-pointer types.
5562       //
5563       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5564           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5565         continue;
5566 
5567       MinWidth = std::min(MinWidth,
5568                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5569       MaxWidth = std::max(MaxWidth,
5570                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5571     }
5572   }
5573 
5574   return {MinWidth, MaxWidth};
5575 }
5576 
5577 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5578                                                            unsigned LoopCost) {
5579   // -- The interleave heuristics --
5580   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5581   // There are many micro-architectural considerations that we can't predict
5582   // at this level. For example, frontend pressure (on decode or fetch) due to
5583   // code size, or the number and capabilities of the execution ports.
5584   //
5585   // We use the following heuristics to select the interleave count:
5586   // 1. If the code has reductions, then we interleave to break the cross
5587   // iteration dependency.
5588   // 2. If the loop is really small, then we interleave to reduce the loop
5589   // overhead.
5590   // 3. We don't interleave if we think that we will spill registers to memory
5591   // due to the increased register pressure.
5592 
5593   if (!isScalarEpilogueAllowed())
5594     return 1;
5595 
5596   // The max safe dependence distance already constrains us; don't interleave.
5597   if (Legal->getMaxSafeDepDistBytes() != -1U)
5598     return 1;
5599 
5600   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5601   const bool HasReductions = !Legal->getReductionVars().empty();
5602   // Do not interleave loops with a relatively small known or estimated trip
5603   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5604   // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5605   // because with the above conditions interleaving can expose ILP and break
5606   // cross iteration dependences for reductions.
5607   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5608       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5609     return 1;
5610 
5611   RegisterUsage R = calculateRegisterUsage({VF})[0];
5612   // We divide by these constants so assume that we have at least one
5613   // instruction that uses at least one register.
5614   for (auto& pair : R.MaxLocalUsers) {
5615     pair.second = std::max(pair.second, 1U);
5616   }
5617 
5618   // We calculate the interleave count using the following formula.
5619   // Subtract the number of loop invariants from the number of available
5620   // registers. These registers are used by all of the interleaved instances.
5621   // Next, divide the remaining registers by the number of registers that is
5622   // required by the loop, in order to estimate how many parallel instances
5623   // fit without causing spills. All of this is rounded down if necessary to be
5624   // a power of two. We want power of two interleave count to simplify any
5625   // addressing operations or alignment considerations.
5626   // We also want power of two interleave counts to ensure that the induction
5627   // variable of the vector loop wraps to zero, when tail is folded by masking;
5628   // this currently happens when OptForSize, in which case IC is set to 1 above.
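  // Illustrative example (hypothetical numbers): with 32 registers in a class,
  // 2 of them tied up by loop invariants, and a maximum local usage of 5, the
  // basic estimate (before the induction-variable adjustment below) is
  // PowerOf2Floor((32 - 2) / 5) = PowerOf2Floor(6) = 4 interleaved instances.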
5629   unsigned IC = UINT_MAX;
5630 
5631   for (auto& pair : R.MaxLocalUsers) {
5632     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5633     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5634                       << " registers of "
5635                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5636     if (VF.isScalar()) {
5637       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5638         TargetNumRegisters = ForceTargetNumScalarRegs;
5639     } else {
5640       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5641         TargetNumRegisters = ForceTargetNumVectorRegs;
5642     }
5643     unsigned MaxLocalUsers = pair.second;
5644     unsigned LoopInvariantRegs = 0;
5645     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5646       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5647 
5648     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5649     // Don't count the induction variable as interleaved.
5650     if (EnableIndVarRegisterHeur) {
5651       TmpIC =
5652           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5653                         std::max(1U, (MaxLocalUsers - 1)));
5654     }
5655 
5656     IC = std::min(IC, TmpIC);
5657   }
5658 
5659   // Clamp the interleave ranges to reasonable counts.
5660   assert(!VF.isScalable() && "scalable vectors not yet supported.");
5661   unsigned MaxInterleaveCount =
5662       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5663 
5664   // Check if the user has overridden the max.
5665   if (VF.isScalar()) {
5666     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5667       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5668   } else {
5669     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5670       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5671   }
5672 
5673   // If the trip count is a known or estimated compile-time constant, limit the
5674   // interleave count to at most the trip count divided by VF, provided it
5675   // is at least 1.
5676   if (BestKnownTC) {
5677     MaxInterleaveCount =
5678         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5679     // Make sure MaxInterleaveCount is greater than 0.
5680     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5681   }
5682 
5683   assert(MaxInterleaveCount > 0 &&
5684          "Maximum interleave count must be greater than 0");
5685 
5686   // Clamp the calculated IC to be between 1 and the max interleave count
5687   // that the target and trip count allow.
5688   if (IC > MaxInterleaveCount)
5689     IC = MaxInterleaveCount;
5690   else
5691     // Make sure IC is greater than 0.
5692     IC = std::max(1u, IC);
5693 
5694   assert(IC > 0 && "Interleave count must be greater than 0.");
5695 
5696   // If we did not calculate the cost for VF (because the user selected the VF)
5697   // then we calculate the cost of VF here.
5698   if (LoopCost == 0)
5699     LoopCost = expectedCost(VF).first;
5700 
5701   assert(LoopCost && "Non-zero loop cost expected");
5702 
5703   // Interleave if we vectorized this loop and there is a reduction that could
5704   // benefit from interleaving.
5705   if (VF.isVector() && HasReductions) {
5706     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5707     return IC;
5708   }
5709 
5710   // Note that if we've already vectorized the loop we will have done the
5711   // runtime check and so interleaving won't require further checks.
5712   bool InterleavingRequiresRuntimePointerCheck =
5713       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5714 
5715   // We want to interleave small loops in order to reduce the loop overhead and
5716   // potentially expose ILP opportunities.
5717   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5718                     << "LV: IC is " << IC << '\n'
5719                     << "LV: VF is " << VF.getKnownMinValue() << '\n');
5720   const bool AggressivelyInterleaveReductions =
5721       TTI.enableAggressiveInterleaving(HasReductions);
5722   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5723     // We assume that the cost overhead is 1 and we use the cost model
5724     // to estimate the cost of the loop and interleave until the cost of the
5725     // loop overhead is about 5% of the cost of the loop.
5726     unsigned SmallIC =
5727         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
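    // Illustrative example (hypothetical costs): SmallLoopCost = 20 and
    // LoopCost = 4 give SmallIC = min(IC, PowerOf2Floor(20 / 4)) = min(IC, 4).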
5728 
5729     // Interleave until store/load ports (estimated by max interleave count) are
5730     // saturated.
5731     unsigned NumStores = Legal->getNumStores();
5732     unsigned NumLoads = Legal->getNumLoads();
5733     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5734     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5735 
5736     // If we have a scalar reduction (vector reductions are already dealt with
5737     // by this point), we can increase the critical path length if the loop
5738     // we're interleaving is inside another loop. Limit, by default to 2, so the
5739     // critical path only gets increased by one reduction operation.
5740     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5741       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5742       SmallIC = std::min(SmallIC, F);
5743       StoresIC = std::min(StoresIC, F);
5744       LoadsIC = std::min(LoadsIC, F);
5745     }
5746 
5747     if (EnableLoadStoreRuntimeInterleave &&
5748         std::max(StoresIC, LoadsIC) > SmallIC) {
5749       LLVM_DEBUG(
5750           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5751       return std::max(StoresIC, LoadsIC);
5752     }
5753 
5754     // If there are scalar reductions and TTI has enabled aggressive
5755     // interleaving for reductions, we will interleave to expose ILP.
5756     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5757         AggressivelyInterleaveReductions) {
5758       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5759       // Interleave no less than SmallIC but not as aggressive as the normal IC
5760       // to satisfy the rare situation when resources are too limited.
5761       return std::max(IC / 2, SmallIC);
5762     } else {
5763       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5764       return SmallIC;
5765     }
5766   }
5767 
5768   // Interleave if this is a large loop (small loops are already dealt with by
5769   // this point) that could benefit from interleaving.
5770   if (AggressivelyInterleaveReductions) {
5771     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5772     return IC;
5773   }
5774 
5775   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5776   return 1;
5777 }
5778 
5779 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5780 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5781   // This function calculates the register usage by measuring the highest number
5782   // of values that are alive at a single location. Obviously, this is a very
5783   // rough estimation. We scan the loop in topological order and
5784   // assign a number to each instruction. We use RPO to ensure that defs are
5785   // met before their users. We assume that each instruction that has in-loop
5786   // users starts an interval. We record every time that an in-loop value is
5787   // used, so we have a list of the first and last occurrences of each
5788   // instruction. Next, we transpose this data structure into a multi map that
5789   // holds the list of intervals that *end* at a specific location. This multi
5790   // map allows us to perform a linear search. We scan the instructions linearly
5791   // and record each time that a new interval starts, by placing it in a set.
5792   // If we find this value in the multi-map then we remove it from the set.
5793   // The max register usage is the maximum size of the set.
5794   // We also search for instructions that are defined outside the loop, but are
5795   // used inside the loop. We need this number separately from the max-interval
5796   // usage number because when we unroll, loop-invariant values do not take
5797   // more registers.
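  // Illustrative example: in the straight-line sequence a = ...; b = f(a);
  // c = g(a, b), roughly speaking both a and b are still live when c is
  // defined, so at most two values are simultaneously live at that point.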
5798   LoopBlocksDFS DFS(TheLoop);
5799   DFS.perform(LI);
5800 
5801   RegisterUsage RU;
5802 
5803   // Each 'key' in the map opens a new interval. The values
5804   // of the map are the index of the 'last seen' usage of the
5805   // instruction that is the key.
5806   using IntervalMap = DenseMap<Instruction *, unsigned>;
5807 
5808   // Maps instruction to its index.
5809   SmallVector<Instruction *, 64> IdxToInstr;
5810   // Marks the end of each interval.
5811   IntervalMap EndPoint;
5812   // Saves the set of instructions that are used within the loop.
5813   SmallPtrSet<Instruction *, 8> Ends;
5814   // Saves the list of values that are used in the loop but are
5815   // defined outside the loop, such as arguments and constants.
5816   SmallPtrSet<Value *, 8> LoopInvariants;
5817 
5818   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5819     for (Instruction &I : BB->instructionsWithoutDebug()) {
5820       IdxToInstr.push_back(&I);
5821 
5822       // Save the end location of each USE.
5823       for (Value *U : I.operands()) {
5824         auto *Instr = dyn_cast<Instruction>(U);
5825 
5826         // Ignore non-instruction values such as arguments, constants, etc.
5827         if (!Instr)
5828           continue;
5829 
5830         // If this instruction is outside the loop then record it and continue.
5831         if (!TheLoop->contains(Instr)) {
5832           LoopInvariants.insert(Instr);
5833           continue;
5834         }
5835 
5836         // Overwrite previous end points.
5837         EndPoint[Instr] = IdxToInstr.size();
5838         Ends.insert(Instr);
5839       }
5840     }
5841   }
5842 
5843   // Saves the list of intervals that end with the index in 'key'.
5844   using InstrList = SmallVector<Instruction *, 2>;
5845   DenseMap<unsigned, InstrList> TransposeEnds;
5846 
5847   // Transpose the EndPoints to a list of values that end at each index.
5848   for (auto &Interval : EndPoint)
5849     TransposeEnds[Interval.second].push_back(Interval.first);
5850 
5851   SmallPtrSet<Instruction *, 8> OpenIntervals;
5852   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5853   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5854 
5855   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5856 
5857   // A lambda that gets the register usage for the given type and VF.
5858   const auto &TTICapture = TTI;
5859   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
5860     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5861       return 0U;
5862     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5863   };
5864 
5865   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5866     Instruction *I = IdxToInstr[i];
5867 
5868     // Remove all of the instructions that end at this location.
5869     InstrList &List = TransposeEnds[i];
5870     for (Instruction *ToRemove : List)
5871       OpenIntervals.erase(ToRemove);
5872 
5873     // Ignore instructions that are never used within the loop.
5874     if (!Ends.count(I))
5875       continue;
5876 
5877     // Skip ignored values.
5878     if (ValuesToIgnore.count(I))
5879       continue;
5880 
5881     // For each VF find the maximum usage of registers.
5882     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5883       // Count the number of live intervals.
5884       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5885 
5886       if (VFs[j].isScalar()) {
5887         for (auto Inst : OpenIntervals) {
5888           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5889           if (RegUsage.find(ClassID) == RegUsage.end())
5890             RegUsage[ClassID] = 1;
5891           else
5892             RegUsage[ClassID] += 1;
5893         }
5894       } else {
5895         collectUniformsAndScalars(VFs[j]);
5896         for (auto Inst : OpenIntervals) {
5897           // Skip ignored values for VF > 1.
5898           if (VecValuesToIgnore.count(Inst))
5899             continue;
5900           if (isScalarAfterVectorization(Inst, VFs[j])) {
5901             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5902             if (RegUsage.find(ClassID) == RegUsage.end())
5903               RegUsage[ClassID] = 1;
5904             else
5905               RegUsage[ClassID] += 1;
5906           } else {
5907             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5908             if (RegUsage.find(ClassID) == RegUsage.end())
5909               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5910             else
5911               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5912           }
5913         }
5914       }
5915 
5916       for (auto& pair : RegUsage) {
5917         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5918           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5919         else
5920           MaxUsages[j][pair.first] = pair.second;
5921       }
5922     }
5923 
5924     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5925                       << OpenIntervals.size() << '\n');
5926 
5927     // Add the current instruction to the list of open intervals.
5928     OpenIntervals.insert(I);
5929   }
5930 
5931   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5932     SmallMapVector<unsigned, unsigned, 4> Invariant;
5933 
5934     for (auto Inst : LoopInvariants) {
5935       unsigned Usage =
5936           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5937       unsigned ClassID =
5938           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
5939       if (Invariant.find(ClassID) == Invariant.end())
5940         Invariant[ClassID] = Usage;
5941       else
5942         Invariant[ClassID] += Usage;
5943     }
5944 
5945     LLVM_DEBUG({
5946       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5947       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5948              << " item\n";
5949       for (const auto &pair : MaxUsages[i]) {
5950         dbgs() << "LV(REG): RegisterClass: "
5951                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5952                << " registers\n";
5953       }
5954       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5955              << " item\n";
5956       for (const auto &pair : Invariant) {
5957         dbgs() << "LV(REG): RegisterClass: "
5958                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5959                << " registers\n";
5960       }
5961     });
5962 
5963     RU.LoopInvariantRegs = Invariant;
5964     RU.MaxLocalUsers = MaxUsages[i];
5965     RUs[i] = RU;
5966   }
5967 
5968   return RUs;
5969 }
5970 
5971 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5972   // TODO: Cost model for emulated masked load/store is completely
5973   // broken. This hack guides the cost model to use an artificially
5974   // high enough value to practically disable vectorization with such
5975   // operations, except where previously deployed legality hack allowed
5976   // using very low cost values. This is to avoid regressions coming simply
5977   // from moving "masked load/store" check from legality to cost model.
5978   // Masked Load/Gather emulation was previously never allowed.
5979   // Limited number of Masked Store/Scatter emulation was allowed.
5980   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5981   return isa<LoadInst>(I) ||
5982          (isa<StoreInst>(I) &&
5983           NumPredStores > NumberOfStoresToPredicate);
5984 }
5985 
5986 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5987   // If we aren't vectorizing the loop, or if we've already collected the
5988   // instructions to scalarize, there's nothing to do. Collection may already
5989   // have occurred if we have a user-selected VF and are now computing the
5990   // expected cost for interleaving.
5991   if (VF.isScalar() || VF.isZero() ||
5992       InstsToScalarize.find(VF) != InstsToScalarize.end())
5993     return;
5994 
5995   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5996   // not profitable to scalarize any instructions, the presence of VF in the
5997   // map will indicate that we've analyzed it already.
5998   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5999 
6000   // Find all the instructions that are scalar with predication in the loop and
6001   // determine if it would be better to not if-convert the blocks they are in.
6002   // If so, we also record the instructions to scalarize.
6003   for (BasicBlock *BB : TheLoop->blocks()) {
6004     if (!blockNeedsPredication(BB))
6005       continue;
6006     for (Instruction &I : *BB)
6007       if (isScalarWithPredication(&I)) {
6008         ScalarCostsTy ScalarCosts;
6009         // Do not apply the discount logic if the hacked cost is needed
6010         // for emulated masked memrefs.
6011         if (!useEmulatedMaskMemRefHack(&I) &&
6012             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6013           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6014         // Remember that BB will remain after vectorization.
6015         PredicatedBBsAfterVectorization.insert(BB);
6016       }
6017   }
6018 }
6019 
6020 int LoopVectorizationCostModel::computePredInstDiscount(
6021     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
6022     ElementCount VF) {
6023   assert(!isUniformAfterVectorization(PredInst, VF) &&
6024          "Instruction marked uniform-after-vectorization will be predicated");
6025 
6026   // Initialize the discount to zero, meaning that the scalar version and the
6027   // vector version cost the same.
6028   int Discount = 0;
6029 
6030   // Holds instructions to analyze. The instructions we visit are mapped in
6031   // ScalarCosts. Those instructions are the ones that would be scalarized if
6032   // we find that the scalar version costs less.
6033   SmallVector<Instruction *, 8> Worklist;
6034 
6035   // Returns true if the given instruction can be scalarized.
6036   auto canBeScalarized = [&](Instruction *I) -> bool {
6037     // We only attempt to scalarize instructions forming a single-use chain
6038     // from the original predicated block that would otherwise be vectorized.
6039     // Although not strictly necessary, we give up on instructions we know will
6040     // already be scalar to avoid traversing chains that are unlikely to be
6041     // beneficial.
6042     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6043         isScalarAfterVectorization(I, VF))
6044       return false;
6045 
6046     // If the instruction is scalar with predication, it will be analyzed
6047     // separately. We ignore it within the context of PredInst.
6048     if (isScalarWithPredication(I))
6049       return false;
6050 
6051     // If any of the instruction's operands are uniform after vectorization,
6052     // the instruction cannot be scalarized. This prevents, for example, a
6053     // masked load from being scalarized.
6054     //
6055     // We assume we will only emit a value for lane zero of an instruction
6056     // marked uniform after vectorization, rather than VF identical values.
6057     // Thus, if we scalarize an instruction that uses a uniform, we would
6058     // create uses of values corresponding to the lanes we aren't emitting code
6059     // for. This behavior can be changed by allowing getScalarValue to clone
6060     // the lane zero values for uniforms rather than asserting.
6061     for (Use &U : I->operands())
6062       if (auto *J = dyn_cast<Instruction>(U.get()))
6063         if (isUniformAfterVectorization(J, VF))
6064           return false;
6065 
6066     // Otherwise, we can scalarize the instruction.
6067     return true;
6068   };
6069 
6070   // Compute the expected cost discount from scalarizing the entire expression
6071   // feeding the predicated instruction. We currently only consider expressions
6072   // that are single-use instruction chains.
6073   Worklist.push_back(PredInst);
6074   while (!Worklist.empty()) {
6075     Instruction *I = Worklist.pop_back_val();
6076 
6077     // If we've already analyzed the instruction, there's nothing to do.
6078     if (ScalarCosts.find(I) != ScalarCosts.end())
6079       continue;
6080 
6081     // Compute the cost of the vector instruction. Note that this cost already
6082     // includes the scalarization overhead of the predicated instruction.
6083     unsigned VectorCost = getInstructionCost(I, VF).first;
6084 
6085     // Compute the cost of the scalarized instruction. This cost is the cost of
6086     // the instruction as if it wasn't if-converted and instead remained in the
6087     // predicated block. We will scale this cost by block probability after
6088     // computing the scalarization overhead.
6089     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6090     unsigned ScalarCost =
6091         VF.getKnownMinValue() *
6092         getInstructionCost(I, ElementCount::getFixed(1)).first;
6093 
6094     // Compute the scalarization overhead of needed insertelement instructions
6095     // and phi nodes.
6096     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6097       ScalarCost += TTI.getScalarizationOverhead(
6098           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6099           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6100       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6101       ScalarCost +=
6102           VF.getKnownMinValue() *
6103           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6104     }
6105 
6106     // Compute the scalarization overhead of needed extractelement
6107     // instructions. For each of the instruction's operands, if the operand can
6108     // be scalarized, add it to the worklist; otherwise, account for the
6109     // overhead.
6110     for (Use &U : I->operands())
6111       if (auto *J = dyn_cast<Instruction>(U.get())) {
6112         assert(VectorType::isValidElementType(J->getType()) &&
6113                "Instruction has non-scalar type");
6114         if (canBeScalarized(J))
6115           Worklist.push_back(J);
6116         else if (needsExtract(J, VF)) {
6117           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6118           ScalarCost += TTI.getScalarizationOverhead(
6119               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6120               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6121         }
6122       }
6123 
6124     // Scale the total scalar cost by block probability.
6125     ScalarCost /= getReciprocalPredBlockProb();
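    // E.g., with a reciprocal block probability of 2 (predicated blocks are
    // assumed to execute half the time), a raw scalar cost of 8 becomes 4.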
6126 
6127     // Compute the discount. A non-negative discount means the vector version
6128     // of the instruction costs more, and scalarizing would be beneficial.
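    // E.g., VectorCost == 8 and ScalarCost == 5 add 3 to the discount, in
    // favor of scalarizing this chain.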
6129     Discount += VectorCost - ScalarCost;
6130     ScalarCosts[I] = ScalarCost;
6131   }
6132 
6133   return Discount;
6134 }
6135 
6136 LoopVectorizationCostModel::VectorizationCostTy
6137 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6138   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6139   VectorizationCostTy Cost;
6140 
6141   // For each block.
6142   for (BasicBlock *BB : TheLoop->blocks()) {
6143     VectorizationCostTy BlockCost;
6144 
6145     // For each instruction in the old loop.
6146     for (Instruction &I : BB->instructionsWithoutDebug()) {
6147       // Skip ignored values.
6148       if (ValuesToIgnore.count(&I) ||
6149           (VF.isVector() && VecValuesToIgnore.count(&I)))
6150         continue;
6151 
6152       VectorizationCostTy C = getInstructionCost(&I, VF);
6153 
6154       // Check if we should override the cost.
6155       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6156         C.first = ForceTargetInstructionCost;
6157 
6158       BlockCost.first += C.first;
6159       BlockCost.second |= C.second;
6160       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6161                         << " for VF " << VF << " For instruction: " << I
6162                         << '\n');
6163     }
6164 
6165     // If we are vectorizing a predicated block, it will have been
6166     // if-converted. This means that the block's instructions (aside from
6167     // stores and instructions that may divide by zero) will now be
6168     // unconditionally executed. For the scalar case, we may not always execute
6169     // the predicated block. Thus, scale the block's cost by the probability of
6170     // executing it.
6171     if (VF.isScalar() && blockNeedsPredication(BB))
6172       BlockCost.first /= getReciprocalPredBlockProb();
6173 
6174     Cost.first += BlockCost.first;
6175     Cost.second |= BlockCost.second;
6176   }
6177 
6178   return Cost;
6179 }
6180 
6181 /// Gets the address access SCEV after verifying that the access pattern
6182 /// is loop invariant except for the induction variable dependence.
6183 ///
6184 /// This SCEV can be sent to the Target in order to estimate the address
6185 /// calculation cost.
6186 static const SCEV *getAddressAccessSCEV(Value *Ptr,
6187                                         LoopVectorizationLegality *Legal,
6188                                         PredicatedScalarEvolution &PSE,
6189                                         const Loop *TheLoop) {
6191 
6192   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6193   if (!Gep)
6194     return nullptr;
6195 
6196   // We are looking for a gep with all loop invariant indices except for one
6197   // which should be an induction variable.
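  // Illustrative example of a qualifying access:
  //   %gep = getelementptr [16 x [16 x i32]], [16 x [16 x i32]]* %A,
  //                        i64 0, i64 %inv, i64 %iv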
6198   auto SE = PSE.getSE();
6199   unsigned NumOperands = Gep->getNumOperands();
6200   for (unsigned i = 1; i < NumOperands; ++i) {
6201     Value *Opd = Gep->getOperand(i);
6202     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6203         !Legal->isInductionVariable(Opd))
6204       return nullptr;
6205   }
6206 
6207   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6208   return PSE.getSCEV(Ptr);
6209 }
6210 
6211 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6212   return Legal->hasStride(I->getOperand(0)) ||
6213          Legal->hasStride(I->getOperand(1));
6214 }
6215 
6216 unsigned
6217 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6218                                                         ElementCount VF) {
6219   assert(VF.isVector() &&
6220          "Scalarization cost of instruction implies vectorization.");
6221   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6222   Type *ValTy = getMemInstValueType(I);
6223   auto SE = PSE.getSE();
6224 
6225   unsigned AS = getLoadStoreAddressSpace(I);
6226   Value *Ptr = getLoadStorePointerOperand(I);
6227   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6228 
6229   // Figure out whether the access is strided and get the stride value
6230   // if it's known at compile time.
6231   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6232 
6233   // Get the cost of the scalar memory instruction and address computation.
6234   unsigned Cost =
6235       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6236 
6237   // Don't pass *I here, since it is scalar but will actually be part of a
6238   // vectorized loop where the user of it is a vectorized instruction.
6239   const Align Alignment = getLoadStoreAlignment(I);
6240   Cost += VF.getKnownMinValue() *
6241           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6242                               AS, TTI::TCK_RecipThroughput);
6243 
6244   // Get the overhead of the extractelement and insertelement instructions
6245   // we might create due to scalarization.
6246   Cost += getScalarizationOverhead(I, VF);
6247 
6248   // If we have a predicated store, it may not be executed for each vector
6249   // lane. Scale the cost by the probability of executing the predicated
6250   // block.
6251   if (isPredicatedInst(I)) {
6252     Cost /= getReciprocalPredBlockProb();
6253 
6254     if (useEmulatedMaskMemRefHack(I))
6255       // Artificially setting to a high enough value to practically disable
6256       // vectorization with such operations.
6257       Cost = 3000000;
6258   }
6259 
6260   return Cost;
6261 }
6262 
6263 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6264                                                              ElementCount VF) {
6265   Type *ValTy = getMemInstValueType(I);
6266   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6267   Value *Ptr = getLoadStorePointerOperand(I);
6268   unsigned AS = getLoadStoreAddressSpace(I);
6269   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6270   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6271 
6272   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6273          "Stride should be 1 or -1 for consecutive memory access");
6274   const Align Alignment = getLoadStoreAlignment(I);
6275   unsigned Cost = 0;
6276   if (Legal->isMaskRequired(I))
6277     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6278                                       CostKind);
6279   else
6280     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6281                                 CostKind, I);
6282 
6283   bool Reverse = ConsecutiveStride < 0;
6284   if (Reverse)
6285     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6286   return Cost;
6287 }
6288 
6289 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6290                                                          ElementCount VF) {
6291   assert(Legal->isUniformMemOp(*I));
6292 
6293   Type *ValTy = getMemInstValueType(I);
6294   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6295   const Align Alignment = getLoadStoreAlignment(I);
6296   unsigned AS = getLoadStoreAddressSpace(I);
6297   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
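  // A uniform load is costed as a scalar load plus a broadcast; a uniform
  // store as a scalar store plus, if the stored value is not loop-invariant,
  // an extract of the last vector lane.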
6298   if (isa<LoadInst>(I)) {
6299     return TTI.getAddressComputationCost(ValTy) +
6300            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6301                                CostKind) +
6302            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6303   }
6304   StoreInst *SI = cast<StoreInst>(I);
6305 
6306   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6307   return TTI.getAddressComputationCost(ValTy) +
6308          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6309                              CostKind) +
6310          (isLoopInvariantStoreValue
6311               ? 0
6312               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6313                                        VF.getKnownMinValue() - 1));
6314 }
6315 
6316 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6317                                                           ElementCount VF) {
6318   Type *ValTy = getMemInstValueType(I);
6319   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6320   const Align Alignment = getLoadStoreAlignment(I);
6321   const Value *Ptr = getLoadStorePointerOperand(I);
6322 
6323   return TTI.getAddressComputationCost(VectorTy) +
6324          TTI.getGatherScatterOpCost(
6325              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6326              TargetTransformInfo::TCK_RecipThroughput, I);
6327 }
6328 
6329 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6330                                                             ElementCount VF) {
6331   Type *ValTy = getMemInstValueType(I);
6332   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6333   unsigned AS = getLoadStoreAddressSpace(I);
6334 
6335   auto Group = getInterleavedAccessGroup(I);
6336   assert(Group && "Failed to get an interleaved access group.");
6337 
6338   unsigned InterleaveFactor = Group->getFactor();
6339   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6340   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
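  // E.g., VF = 4 with an interleave factor of 2 costs the group as a single
  // 8-element wide vector access.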
6341 
6342   // Holds the indices of existing members in an interleaved load group.
6343   // An interleaved store group doesn't need this as it doesn't allow gaps.
6344   SmallVector<unsigned, 4> Indices;
6345   if (isa<LoadInst>(I)) {
6346     for (unsigned i = 0; i < InterleaveFactor; i++)
6347       if (Group->getMember(i))
6348         Indices.push_back(i);
6349   }
6350 
6351   // Calculate the cost of the whole interleaved group.
6352   bool UseMaskForGaps =
6353       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6354   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6355       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6356       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6357 
6358   if (Group->isReverse()) {
6359     // TODO: Add support for reversed masked interleaved access.
6360     assert(!Legal->isMaskRequired(I) &&
6361            "Reverse masked interleaved access not supported.");
6362     Cost += Group->getNumMembers() *
6363             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6364   }
6365   return Cost;
6366 }
6367 
6368 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6369                                                               ElementCount VF) {
6370   // Calculate scalar cost only. Vectorization cost should be ready at this
6371   // moment.
6372   if (VF.isScalar()) {
6373     Type *ValTy = getMemInstValueType(I);
6374     const Align Alignment = getLoadStoreAlignment(I);
6375     unsigned AS = getLoadStoreAddressSpace(I);
6376 
6377     return TTI.getAddressComputationCost(ValTy) +
6378            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6379                                TTI::TCK_RecipThroughput, I);
6380   }
6381   return getWideningCost(I, VF);
6382 }
6383 
6384 LoopVectorizationCostModel::VectorizationCostTy
6385 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6386                                                ElementCount VF) {
6387   assert(!VF.isScalable() &&
6388          "the cost model is not yet implemented for scalable vectorization");
6389   // If we know that this instruction will remain uniform, check the cost of
6390   // the scalar version.
6391   if (isUniformAfterVectorization(I, VF))
6392     VF = ElementCount::getFixed(1);
6393 
6394   if (VF.isVector() && isProfitableToScalarize(I, VF))
6395     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6396 
6397   // Forced scalars do not have any scalarization overhead.
6398   auto ForcedScalar = ForcedScalars.find(VF);
6399   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6400     auto InstSet = ForcedScalar->second;
6401     if (InstSet.count(I))
6402       return VectorizationCostTy(
6403           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6404            VF.getKnownMinValue()),
6405           false);
6406   }
6407 
6408   Type *VectorTy;
6409   unsigned C = getInstructionCost(I, VF, VectorTy);
6410 
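  // The type counts as "not scalarized" when the target legalizes VectorTy
  // into fewer than VF parts; otherwise each lane is effectively handled as
  // its own scalar operation.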
6411   bool TypeNotScalarized =
6412       VF.isVector() && VectorTy->isVectorTy() &&
6413       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6414   return VectorizationCostTy(C, TypeNotScalarized);
6415 }
6416 
6417 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6418                                                               ElementCount VF) {
6419 
6420   assert(!VF.isScalable() &&
6421          "cannot compute scalarization overhead for scalable vectorization");
6422   if (VF.isScalar())
6423     return 0;
6424 
6425   unsigned Cost = 0;
6426   Type *RetTy = ToVectorTy(I->getType(), VF);
6427   if (!RetTy->isVoidTy() &&
6428       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6429     Cost += TTI.getScalarizationOverhead(
6430         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6431         true, false);
6432 
6433   // Some targets keep addresses scalar.
6434   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6435     return Cost;
6436 
6437   // Some targets support efficient element stores.
6438   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6439     return Cost;
6440 
6441   // Collect operands to consider.
6442   CallInst *CI = dyn_cast<CallInst>(I);
6443   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6444 
6445   // Skip operands that do not require extraction/scalarization and do not incur
6446   // any overhead.
6447   return Cost + TTI.getOperandsScalarizationOverhead(
6448                     filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6449 }
6450 
6451 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6452   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6453   if (VF.isScalar())
6454     return;
6455   NumPredStores = 0;
6456   for (BasicBlock *BB : TheLoop->blocks()) {
6457     // For each instruction in the old loop.
6458     for (Instruction &I : *BB) {
6459       Value *Ptr = getLoadStorePointerOperand(&I);
6460       if (!Ptr)
6461         continue;
6462 
6463       // TODO: We should generate better code and update the cost model for
6464       // predicated uniform stores. Today they are treated as any other
6465       // predicated store (see added test cases in
6466       // invariant-store-vectorization.ll).
6467       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6468         NumPredStores++;
6469 
6470       if (Legal->isUniformMemOp(I)) {
6471         // TODO: Avoid replicating loads and stores instead of
6472         // relying on instcombine to remove them.
6473         // Load: Scalar load + broadcast
6474         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6475         unsigned Cost = getUniformMemOpCost(&I, VF);
6476         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6477         continue;
6478       }
6479 
6480       // We assume that widening is the best solution when possible.
6481       if (memoryInstructionCanBeWidened(&I, VF)) {
6482         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6483         int ConsecutiveStride =
6484             Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6485         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6486                "Expected consecutive stride.");
6487         InstWidening Decision =
6488             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6489         setWideningDecision(&I, VF, Decision, Cost);
6490         continue;
6491       }
6492 
6493       // Choose between Interleaving, Gather/Scatter or Scalarization.
6494       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6495       unsigned NumAccesses = 1;
6496       if (isAccessInterleaved(&I)) {
6497         auto Group = getInterleavedAccessGroup(&I);
6498         assert(Group && "Failed to get an interleaved access group.");
6499 
6500         // Make one decision for the whole group.
6501         if (getWideningDecision(&I, VF) != CM_Unknown)
6502           continue;
6503 
6504         NumAccesses = Group->getNumMembers();
6505         if (interleavedAccessCanBeWidened(&I, VF))
6506           InterleaveCost = getInterleaveGroupCost(&I, VF);
6507       }
6508 
6509       unsigned GatherScatterCost =
6510           isLegalGatherOrScatter(&I)
6511               ? getGatherScatterCost(&I, VF) * NumAccesses
6512               : std::numeric_limits<unsigned>::max();
6513 
6514       unsigned ScalarizationCost =
6515           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6516 
6517       // Choose the best option for the current VF, write down this
6518       // decision, and use it during vectorization.
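      // Ties between interleaving and gather/scatter go to interleaving;
      // ties between gather/scatter and scalarization go to scalarization.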
6519       unsigned Cost;
6520       InstWidening Decision;
6521       if (InterleaveCost <= GatherScatterCost &&
6522           InterleaveCost < ScalarizationCost) {
6523         Decision = CM_Interleave;
6524         Cost = InterleaveCost;
6525       } else if (GatherScatterCost < ScalarizationCost) {
6526         Decision = CM_GatherScatter;
6527         Cost = GatherScatterCost;
6528       } else {
6529         Decision = CM_Scalarize;
6530         Cost = ScalarizationCost;
6531       }
6532       // If the instruction belongs to an interleave group, the whole group
6533       // receives the same decision. The whole group also receives the cost,
6534       // but the cost is actually assigned to only one of its instructions.
6535       if (auto Group = getInterleavedAccessGroup(&I))
6536         setWideningDecision(Group, VF, Decision, Cost);
6537       else
6538         setWideningDecision(&I, VF, Decision, Cost);
6539     }
6540   }
6541 
6542   // Make sure that any load of address and any other address computation
6543   // remains scalar unless there is gather/scatter support. This avoids
6544   // inevitable extracts into address registers, and also has the benefit of
6545   // activating LSR more, since that pass can't optimize vectorized
6546   // addresses.
6547   if (TTI.prefersVectorizedAddressing())
6548     return;
6549 
6550   // Start with all scalar pointer uses.
6551   SmallPtrSet<Instruction *, 8> AddrDefs;
6552   for (BasicBlock *BB : TheLoop->blocks())
6553     for (Instruction &I : *BB) {
6554       Instruction *PtrDef =
6555         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6556       if (PtrDef && TheLoop->contains(PtrDef) &&
6557           getWideningDecision(&I, VF) != CM_GatherScatter)
6558         AddrDefs.insert(PtrDef);
6559     }
6560 
6561   // Add all instructions used to generate the addresses.
6562   SmallVector<Instruction *, 4> Worklist;
6563   for (auto *I : AddrDefs)
6564     Worklist.push_back(I);
6565   while (!Worklist.empty()) {
6566     Instruction *I = Worklist.pop_back_val();
6567     for (auto &Op : I->operands())
6568       if (auto *InstOp = dyn_cast<Instruction>(Op))
6569         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6570             AddrDefs.insert(InstOp).second)
6571           Worklist.push_back(InstOp);
6572   }
6573 
6574   for (auto *I : AddrDefs) {
6575     if (isa<LoadInst>(I)) {
6576       // Setting the desired widening decision should ideally be handled by
6577       // the cost functions, but since this involves the task of finding out
6578       // if the loaded register is involved in an address computation, it is
6579       // instead changed here when we know this is the case.
6580       InstWidening Decision = getWideningDecision(I, VF);
6581       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6582         // Scalarize a widened load of address.
6583         setWideningDecision(
6584             I, VF, CM_Scalarize,
6585             (VF.getKnownMinValue() *
6586              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6587       else if (auto Group = getInterleavedAccessGroup(I)) {
6588         // Scalarize an interleave group of address loads.
6589         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6590           if (Instruction *Member = Group->getMember(I))
6591             setWideningDecision(
6592                 Member, VF, CM_Scalarize,
6593                 (VF.getKnownMinValue() *
6594                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6595         }
6596       }
6597     } else
6598       // Make sure I gets scalarized and a cost estimate without
6599       // scalarization overhead.
6600       ForcedScalars[VF].insert(I);
6601   }
6602 }
6603 
6604 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6605                                                         ElementCount VF,
6606                                                         Type *&VectorTy) {
6607   Type *RetTy = I->getType();
6608   if (canTruncateToMinimalBitwidth(I, VF))
6609     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6610   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6611   auto SE = PSE.getSE();
6612   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6613 
6614   // TODO: We need to estimate the cost of intrinsic calls.
6615   switch (I->getOpcode()) {
6616   case Instruction::GetElementPtr:
6617     // We mark this instruction as zero-cost because the cost of GEPs in
6618     // vectorized code depends on whether the corresponding memory instruction
6619     // is scalarized or not. Therefore, we handle GEPs with the memory
6620     // instruction cost.
6621     return 0;
6622   case Instruction::Br: {
6623     // In cases of scalarized and predicated instructions, there will be VF
6624     // predicated blocks in the vectorized loop. Each branch around these
6625     // blocks also requires an extract of its vector compare i1 element.
6626     bool ScalarPredicatedBB = false;
6627     BranchInst *BI = cast<BranchInst>(I);
6628     if (VF.isVector() && BI->isConditional() &&
6629         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6630          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6631       ScalarPredicatedBB = true;
6632 
6633     if (ScalarPredicatedBB) {
6634       // Return cost for branches around scalarized and predicated blocks.
6635       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6636       auto *Vec_i1Ty =
6637           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6638       return (TTI.getScalarizationOverhead(
6639                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
6640                   false, true) +
6641               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
6642                VF.getKnownMinValue()));
6643     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6644       // The back-edge branch will remain, as will all scalar branches.
6645       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6646     else
6647       // This branch will be eliminated by if-conversion.
6648       return 0;
6649     // Note: We currently assume zero cost for an unconditional branch inside
6650     // a predicated block since it will become a fall-through, although we
6651     // may decide in the future to call TTI for all branches.
6652   }
6653   case Instruction::PHI: {
6654     auto *Phi = cast<PHINode>(I);
6655 
6656     // First-order recurrences are replaced by vector shuffles inside the loop.
6657     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6658     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
6659       return TTI.getShuffleCost(
6660           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
6661           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
6662 
6663     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6664     // converted into select instructions. We require N - 1 selects per phi
6665     // node, where N is the number of incoming values.
6666     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6667       return (Phi->getNumIncomingValues() - 1) *
6668              TTI.getCmpSelInstrCost(
6669                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6670                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6671                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6672 
6673     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6674   }
6675   case Instruction::UDiv:
6676   case Instruction::SDiv:
6677   case Instruction::URem:
6678   case Instruction::SRem:
6679     // If we have a predicated instruction, it may not be executed for each
6680     // vector lane. Get the scalarization cost and scale this amount by the
6681     // probability of executing the predicated block. If the instruction is not
6682     // predicated, we fall through to the next case.
6683     if (VF.isVector() && isScalarWithPredication(I)) {
6684       unsigned Cost = 0;
6685 
6686       // These instructions have a non-void type, so account for the phi nodes
6687       // that we will create. This cost is likely to be zero. The phi node
6688       // cost, if any, should be scaled by the block probability because it
6689       // models a copy at the end of each predicated block.
6690       Cost += VF.getKnownMinValue() *
6691               TTI.getCFInstrCost(Instruction::PHI, CostKind);
6692 
6693       // The cost of the non-predicated instruction.
6694       Cost += VF.getKnownMinValue() *
6695               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
6696 
6697       // The cost of insertelement and extractelement instructions needed for
6698       // scalarization.
6699       Cost += getScalarizationOverhead(I, VF);
6700 
6701       // Scale the cost by the probability of executing the predicated blocks.
6702       // This assumes the predicated block for each vector lane is equally
6703       // likely.
6704       return Cost / getReciprocalPredBlockProb();
6705     }
6706     LLVM_FALLTHROUGH;
6707   case Instruction::Add:
6708   case Instruction::FAdd:
6709   case Instruction::Sub:
6710   case Instruction::FSub:
6711   case Instruction::Mul:
6712   case Instruction::FMul:
6713   case Instruction::FDiv:
6714   case Instruction::FRem:
6715   case Instruction::Shl:
6716   case Instruction::LShr:
6717   case Instruction::AShr:
6718   case Instruction::And:
6719   case Instruction::Or:
6720   case Instruction::Xor: {
6721     // Since we will replace the stride by 1 the multiplication should go away.
6722     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6723       return 0;
6724     // Certain instructions can be cheaper to vectorize if they have a constant
6725     // second vector operand. One example of this are shifts on x86.
6726     Value *Op2 = I->getOperand(1);
6727     TargetTransformInfo::OperandValueProperties Op2VP;
6728     TargetTransformInfo::OperandValueKind Op2VK =
6729         TTI.getOperandInfo(Op2, Op2VP);
6730     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6731       Op2VK = TargetTransformInfo::OK_UniformValue;
6732 
6733     SmallVector<const Value *, 4> Operands(I->operand_values());
6734     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6735     return N * TTI.getArithmeticInstrCost(
6736                    I->getOpcode(), VectorTy, CostKind,
6737                    TargetTransformInfo::OK_AnyValue,
6738                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6739   }
6740   case Instruction::FNeg: {
6741     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
6742     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6743     return N * TTI.getArithmeticInstrCost(
6744                    I->getOpcode(), VectorTy, CostKind,
6745                    TargetTransformInfo::OK_AnyValue,
6746                    TargetTransformInfo::OK_AnyValue,
6747                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6748                    I->getOperand(0), I);
6749   }
6750   case Instruction::Select: {
6751     SelectInst *SI = cast<SelectInst>(I);
6752     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6753     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6754     Type *CondTy = SI->getCondition()->getType();
6755     if (!ScalarCond) {
6756       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
6757       CondTy = VectorType::get(CondTy, VF);
6758     }
6759     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
6760                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
6761   }
6762   case Instruction::ICmp:
6763   case Instruction::FCmp: {
6764     Type *ValTy = I->getOperand(0)->getType();
6765     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6766     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6767       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6768     VectorTy = ToVectorTy(ValTy, VF);
6769     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
6770                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
6771   }
6772   case Instruction::Store:
6773   case Instruction::Load: {
6774     ElementCount Width = VF;
6775     if (Width.isVector()) {
6776       InstWidening Decision = getWideningDecision(I, Width);
6777       assert(Decision != CM_Unknown &&
6778              "CM decision should be taken at this point");
6779       if (Decision == CM_Scalarize)
6780         Width = ElementCount::getFixed(1);
6781     }
6782     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6783     return getMemoryInstructionCost(I, VF);
6784   }
6785   case Instruction::ZExt:
6786   case Instruction::SExt:
6787   case Instruction::FPToUI:
6788   case Instruction::FPToSI:
6789   case Instruction::FPExt:
6790   case Instruction::PtrToInt:
6791   case Instruction::IntToPtr:
6792   case Instruction::SIToFP:
6793   case Instruction::UIToFP:
6794   case Instruction::Trunc:
6795   case Instruction::FPTrunc:
6796   case Instruction::BitCast: {
6797     // Computes the CastContextHint from a Load/Store instruction.
6798     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6799       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6800              "Expected a load or a store!");
6801 
6802       if (VF.isScalar() || !TheLoop->contains(I))
6803         return TTI::CastContextHint::Normal;
6804 
6805       switch (getWideningDecision(I, VF)) {
6806       case LoopVectorizationCostModel::CM_GatherScatter:
6807         return TTI::CastContextHint::GatherScatter;
6808       case LoopVectorizationCostModel::CM_Interleave:
6809         return TTI::CastContextHint::Interleave;
6810       case LoopVectorizationCostModel::CM_Scalarize:
6811       case LoopVectorizationCostModel::CM_Widen:
6812         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6813                                         : TTI::CastContextHint::Normal;
6814       case LoopVectorizationCostModel::CM_Widen_Reverse:
6815         return TTI::CastContextHint::Reversed;
6816       case LoopVectorizationCostModel::CM_Unknown:
6817         llvm_unreachable("Instr did not go through cost modelling?");
6818       }
6819 
6820       llvm_unreachable("Unhandled case!");
6821     };
6822 
6823     unsigned Opcode = I->getOpcode();
6824     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6825     // For Trunc, the context is the only user, which must be a StoreInst.
6826     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6827       if (I->hasOneUse())
6828         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6829           CCH = ComputeCCH(Store);
6830     }
6831     // For Z/Sext, the context is the operand, which must be a LoadInst.
6832     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6833              Opcode == Instruction::FPExt) {
6834       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6835         CCH = ComputeCCH(Load);
6836     }
6837 
6838     // We optimize the truncation of induction variables having constant
6839     // integer steps. The cost of these truncations is the same as the scalar
6840     // operation.
6841     if (isOptimizableIVTruncate(I, VF)) {
6842       auto *Trunc = cast<TruncInst>(I);
6843       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6844                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6845     }
6846 
6847     Type *SrcScalarTy = I->getOperand(0)->getType();
6848     Type *SrcVecTy =
6849         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6850     if (canTruncateToMinimalBitwidth(I, VF)) {
6851       // This cast is going to be shrunk. This may remove the cast or it might
6852       // turn it into a slightly different cast. For example, if MinBW == 16,
6853       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6854       //
6855       // Calculate the modified src and dest types.
6856       Type *MinVecTy = VectorTy;
6857       if (Opcode == Instruction::Trunc) {
6858         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6859         VectorTy =
6860             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6861       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
6862         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6863         VectorTy =
6864             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6865       }
6866     }
6867 
6868     assert(!VF.isScalable() && "VF is assumed to be non scalable");
6869     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6870     return N *
6871            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6872   }
6873   case Instruction::Call: {
6874     bool NeedToScalarize;
6875     CallInst *CI = cast<CallInst>(I);
6876     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6877     if (getVectorIntrinsicIDForCall(CI, TLI))
6878       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6879     return CallCost;
6880   }
6881   case Instruction::ExtractValue:
6882     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
6883   default:
6884     // The cost of executing VF copies of the scalar instruction. This opcode
6885     // is unknown. Assume that it is the same as 'mul'.
6886     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
6887                                        Instruction::Mul, VectorTy, CostKind) +
6888            getScalarizationOverhead(I, VF);
6889   } // end of switch.
6890 }
6891 
6892 char LoopVectorize::ID = 0;
6893 
6894 static const char lv_name[] = "Loop Vectorization";
6895 
6896 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6897 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6898 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6899 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6900 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6901 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6902 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6903 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6904 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6905 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6906 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6907 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6908 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6909 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6910 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6911 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6912 
6913 namespace llvm {
6914 
6915 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6916 
6917 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6918                               bool VectorizeOnlyWhenForced) {
6919   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6920 }
6921 
6922 } // end namespace llvm
6923 
6924 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6925   // Check if the pointer operand of a load or store instruction is
6926   // consecutive.
6927   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6928     return Legal->isConsecutivePtr(Ptr);
6929   return false;
6930 }
6931 
6932 void LoopVectorizationCostModel::collectValuesToIgnore() {
6933   // Ignore ephemeral values.
6934   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6935 
6936   // Ignore type-promoting instructions we identified during reduction
6937   // detection.
6938   for (auto &Reduction : Legal->getReductionVars()) {
6939     RecurrenceDescriptor &RedDes = Reduction.second;
6940     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6941     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6942   }
6943   // Ignore type-casting instructions we identified during induction
6944   // detection.
6945   for (auto &Induction : Legal->getInductionVars()) {
6946     InductionDescriptor &IndDes = Induction.second;
6947     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6948     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6949   }
6950 }
6951 
6952 void LoopVectorizationCostModel::collectInLoopReductions() {
6953   for (auto &Reduction : Legal->getReductionVars()) {
6954     PHINode *Phi = Reduction.first;
6955     RecurrenceDescriptor &RdxDesc = Reduction.second;
6956 
6957     // We don't collect reductions that are type promoted (yet).
6958     if (RdxDesc.getRecurrenceType() != Phi->getType())
6959       continue;
6960 
6961     // If the target would prefer this reduction to happen "in-loop", then we
6962     // want to record it as such.
6963     unsigned Opcode = RdxDesc.getRecurrenceBinOp();
6964     if (!PreferInLoopReductions &&
6965         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
6966                                    TargetTransformInfo::ReductionFlags()))
6967       continue;
6968 
6969     // Check that we can correctly put the reductions into the loop, by
6970     // finding the chain of operations that leads from the phi to the loop
6971     // exit value.
6972     SmallVector<Instruction *, 4> ReductionOperations =
6973         RdxDesc.getReductionOpChain(Phi, TheLoop);
6974     bool InLoop = !ReductionOperations.empty();
6975     if (InLoop)
6976       InLoopReductionChains[Phi] = ReductionOperations;
6977     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6978                       << " reduction for phi: " << *Phi << "\n");
6979   }
6980 }
6981 
6982 // TODO: we could return a pair of values that specify the max VF and
6983 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6984 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
6985 // doesn't have a cost model that can choose which plan to execute if
6986 // more than one is generated.
6987 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6988                                  LoopVectorizationCostModel &CM) {
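  // E.g., 256-bit vector registers with a widest scalar type of 32 bits
  // yield a VPlan VF of 8.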
6989   unsigned WidestType;
6990   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6991   return WidestVectorRegBits / WidestType;
6992 }
6993 
6994 VectorizationFactor
6995 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
6996   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
6997   ElementCount VF = UserVF;
6998   // Outer loop handling: outer loops may require CFG and instruction-level
6999   // transformations before we can even evaluate whether vectorization is
7000   // profitable. Since we cannot modify the incoming IR, we need to build
7001   // VPlan upfront in the vectorization pipeline.
7002   if (!OrigLoop->isInnermost()) {
7003     // If the user doesn't provide a vectorization factor, determine a
7004     // reasonable one.
7005     if (UserVF.isZero()) {
7006       VF = ElementCount::getFixed(
7007           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7008       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7009 
7010       // Make sure we have a VF > 1 for stress testing.
7011       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7012         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7013                           << "overriding computed VF.\n");
7014         VF = ElementCount::getFixed(4);
7015       }
7016     }
7017     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7018     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7019            "VF needs to be a power of two");
7020     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7021                       << "VF " << VF << " to build VPlans.\n");
7022     buildVPlans(VF, VF);
7023 
7024     // For VPlan build stress testing, we bail out after VPlan construction.
7025     if (VPlanBuildStressTest)
7026       return VectorizationFactor::Disabled();
7027 
7028     return {VF, 0 /*Cost*/};
7029   }
7030 
7031   LLVM_DEBUG(
7032       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7033                 "VPlan-native path.\n");
7034   return VectorizationFactor::Disabled();
7035 }
7036 
7037 Optional<VectorizationFactor>
7038 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7039   assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
7040   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7041   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
7042   if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
7043     return None;
7044 
7045   // Invalidate interleave groups if all blocks of loop will be predicated.
7046   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
7047       !useMaskedInterleavedAccesses(*TTI)) {
7048     LLVM_DEBUG(
7049         dbgs()
7050         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7051            "which requires masked-interleaved support.\n");
7052     if (CM.InterleaveInfo.invalidateGroups())
7053       // Invalidating interleave groups also requires invalidating all decisions
7054       // based on them, which includes widening decisions and uniform and scalar
7055       // values.
7056       CM.invalidateCostModelingDecisions();
7057   }
7058 
7059   ElementCount MaxVF = MaybeMaxVF.getValue();
7060   assert(MaxVF.isNonZero() && "MaxVF is zero.");
7061 
7062   if (!UserVF.isZero() && UserVF.getFixedValue() <= MaxVF.getFixedValue()) {
7063     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7064     assert(isPowerOf2_32(UserVF.getFixedValue()) &&
7065            "VF needs to be a power of two");
7066     // Collect the instructions (and their associated costs) that will be more
7067     // profitable to scalarize.
7068     CM.selectUserVectorizationFactor(UserVF);
7069     CM.collectInLoopReductions();
7070     buildVPlansWithVPRecipes(UserVF, UserVF);
7071     LLVM_DEBUG(printPlans(dbgs()));
7072     return {{UserVF, 0}};
7073   }
7074 
7075   for (ElementCount VF = ElementCount::getFixed(1);
7076        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
7077     // Collect Uniform and Scalar instructions after vectorization with VF.
7078     CM.collectUniformsAndScalars(VF);
7079 
7080     // Collect the instructions (and their associated costs) that will be more
7081     // profitable to scalarize.
7082     if (VF.isVector())
7083       CM.collectInstsToScalarize(VF);
7084   }
7085 
7086   CM.collectInLoopReductions();
7087 
7088   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
7089   LLVM_DEBUG(printPlans(dbgs()));
7090   if (MaxVF.isScalar())
7091     return VectorizationFactor::Disabled();
7092 
7093   // Select the optimal vectorization factor.
7094   return CM.selectVectorizationFactor(MaxVF);
7095 }
7096 
7097 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7098   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7099                     << '\n');
7100   BestVF = VF;
7101   BestUF = UF;
7102 
7103   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7104     return !Plan->hasVF(VF);
7105   });
7106   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7107 }
7108 
7109 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7110                                            DominatorTree *DT) {
7111   // Perform the actual loop transformation.
7112 
7113   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7114   VPCallbackILV CallbackILV(ILV);
7115 
7116   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7117 
7118   VPTransformState State{*BestVF, BestUF,      LI,
7119                          DT,      ILV.Builder, ILV.VectorLoopValueMap,
7120                          &ILV,    CallbackILV};
7121   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7122   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7123   State.CanonicalIV = ILV.Induction;
7124 
7125   //===------------------------------------------------===//
7126   //
7127   // Notice: any optimization or new instruction that goes
7128   // into the code below should also be implemented in
7129   // the cost-model.
7130   //
7131   //===------------------------------------------------===//
7132 
7133   // 2. Copy and widen instructions from the old loop into the new loop.
7134   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7135   VPlans.front()->execute(&State);
7136 
7137   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7138   //    predication, updating analyses.
7139   ILV.fixVectorizedLoop();
7140 }
7141 
7142 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7143     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7144   BasicBlock *Latch = OrigLoop->getLoopLatch();
7145 
7146   // We create new control-flow for the vectorized loop, so the original
7147   // condition will be dead after vectorization if it's only used by the
7148   // branch.
7149   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
7150   if (Cmp && Cmp->hasOneUse()) {
7151     DeadInstructions.insert(Cmp);
7152 
7153     // An operand of the icmp is often a dead trunc, used by IndUpdate.
7154     for (Value *Op : Cmp->operands()) {
7155       if (isa<TruncInst>(Op) && Op->hasOneUse())
7156         DeadInstructions.insert(cast<Instruction>(Op));
7157     }
7158   }
7159 
7160   // We create new "steps" for induction variable updates to which the original
7161   // induction variables map. An original update instruction will be dead if
7162   // all its users except the induction variable are dead.
7163   for (auto &Induction : Legal->getInductionVars()) {
7164     PHINode *Ind = Induction.first;
7165     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7166 
7167     // If the tail is to be folded by masking, the primary induction variable,
7168     // if it exists, isn't dead: it will be used for masking. Don't kill it.
7169     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7170       continue;
7171 
7172     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7173           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7174         }))
7175       DeadInstructions.insert(IndUpdate);
7176 
7177     // We also record as "Dead" the type-casting instructions we had identified
7178     // during induction analysis. We don't need any handling for them in the
7179     // vectorized loop because we have proven that, under a proper runtime
7180     // test guarding the vectorized loop, the value of the phi, and the casted
7181     // value of the phi, are the same. The last instruction in this casting chain
7182     // will get its scalar/vector/widened def from the scalar/vector/widened def
7183     // of the respective phi node. Any other casts in the induction def-use chain
7184     // have no other uses outside the phi update chain, and will be ignored.
7185     InductionDescriptor &IndDes = Induction.second;
7186     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7187     DeadInstructions.insert(Casts.begin(), Casts.end());
7188   }
7189 }
7190 
7191 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7192 
7193 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7194 
7195 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7196                                         Instruction::BinaryOps BinOp) {
7197   // When unrolling and the VF is 1, we only need to add a simple scalar.
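  // That is, the result is Val + StartIdx * Step for integer inductions, or
  // Val BinOp (StartIdx * Step) for floating-point inductions.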
7198   Type *Ty = Val->getType();
7199   assert(!Ty->isVectorTy() && "Val must be a scalar");
7200 
7201   if (Ty->isFloatingPointTy()) {
7202     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7203 
7204     // Floating point operations had to be 'fast' to enable the unrolling.
7205     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7206     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7207   }
7208   Constant *C = ConstantInt::get(Ty, StartIdx);
7209   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7210 }
7211 
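// Attach "llvm.loop.unroll.runtime.disable" metadata to the loop (preserving
// its existing loop metadata), unless unroll-disable metadata is already
// present.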
7212 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7213   SmallVector<Metadata *, 4> MDs;
7214   // Reserve first location for self reference to the LoopID metadata node.
7215   MDs.push_back(nullptr);
7216   bool IsUnrollMetadata = false;
7217   MDNode *LoopID = L->getLoopID();
7218   if (LoopID) {
7219     // First find existing loop unrolling disable metadata.
7220     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7221       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7222       if (MD) {
7223         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7224         IsUnrollMetadata =
7225             S && S->getString().startswith("llvm.loop.unroll.disable");
7226       }
7227       MDs.push_back(LoopID->getOperand(i));
7228     }
7229   }
7230 
7231   if (!IsUnrollMetadata) {
7232     // Add runtime unroll disable metadata.
7233     LLVMContext &Context = L->getHeader()->getContext();
7234     SmallVector<Metadata *, 1> DisableOperands;
7235     DisableOperands.push_back(
7236         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7237     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7238     MDs.push_back(DisableNode);
7239     MDNode *NewLoopID = MDNode::get(Context, MDs);
7240     // Set operand 0 to refer to the loop id itself.
7241     NewLoopID->replaceOperandWith(0, NewLoopID);
7242     L->setLoopID(NewLoopID);
7243   }
7244 }
7245 
7246 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7247     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7248   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
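  // For illustration (numbers only): with Range = {4, 32} and a predicate that
  // holds for VF = 4 and VF = 8 but not for VF = 16, Range.End is clamped to
  // 16 and the predicate's value at Range.Start (true) is returned.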
7249   bool PredicateAtRangeStart = Predicate(Range.Start);
7250 
7251   for (ElementCount TmpVF = Range.Start * 2;
7252        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7253     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7254       Range.End = TmpVF;
7255       break;
7256     }
7257 
7258   return PredicateAtRangeStart;
7259 }
7260 
7261 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7262 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7263 /// of VF's starting at a given VF and extending it as much as possible. Each
7264 /// vectorization decision can potentially shorten this sub-range during
7265 /// buildVPlan().
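/// For example (VF numbers are illustrative), with MinVF = 2 and MaxVF = 16
/// this may produce one VPlan covering VFs {2, 4} and a second covering
/// {8, 16}, depending on where vectorization decisions clamp each sub-range.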
7266 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7267                                            ElementCount MaxVF) {
7268   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7269   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7270     VFRange SubRange = {VF, MaxVFPlusOne};
7271     VPlans.push_back(buildVPlan(SubRange));
7272     VF = SubRange.End;
7273   }
7274 }
7275 
7276 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7277                                          VPlanPtr &Plan) {
7278   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7279 
7280   // Look for cached value.
7281   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7282   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7283   if (ECEntryIt != EdgeMaskCache.end())
7284     return ECEntryIt->second;
7285 
7286   VPValue *SrcMask = createBlockInMask(Src, Plan);
7287 
7288   // The terminator has to be a branch inst!
7289   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7290   assert(BI && "Unexpected terminator found");
7291 
7292   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7293     return EdgeMaskCache[Edge] = SrcMask;
7294 
7295   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
7296   assert(EdgeMask && "No Edge Mask found for condition");
7297 
7298   if (BI->getSuccessor(0) != Dst)
7299     EdgeMask = Builder.createNot(EdgeMask);
7300 
7301   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7302     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7303 
7304   return EdgeMaskCache[Edge] = EdgeMask;
7305 }
7306 
7307 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7308   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7309 
7310   // Look for cached value.
7311   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7312   if (BCEntryIt != BlockMaskCache.end())
7313     return BCEntryIt->second;
7314 
7315   // All-one mask is modelled as no-mask following the convention for masked
7316   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7317   VPValue *BlockMask = nullptr;
7318 
7319   if (OrigLoop->getHeader() == BB) {
7320     if (!CM.blockNeedsPredication(BB))
7321       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7322 
7323     // Create the block in mask as the first non-phi instruction in the block.
7324     VPBuilder::InsertPointGuard Guard(Builder);
7325     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
7326     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
7327 
7328     // Introduce the early-exit compare IV <= BTC to form header block mask.
7329     // This is used instead of IV < TC because TC may wrap, unlike BTC.
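    // For illustration (numbers invented): with a trip count of 10 (BTC = 9)
    // and lanes processing IVs {8,9,10,11}, the mask is {8,9,10,11} <= 9 =
    // {1,1,0,0}; comparing against TC instead could mis-fire if TC wrapped to
    // 0 in the IV's type.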
7330     // Start by constructing the desired canonical IV.
7331     VPValue *IV = nullptr;
7332     if (Legal->getPrimaryInduction())
7333       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
7334     else {
7335       auto IVRecipe = new VPWidenCanonicalIVRecipe();
7336       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
7337       IV = IVRecipe->getVPValue();
7338     }
7339     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7340     bool TailFolded = !CM.isScalarEpilogueAllowed();
7341 
7342     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // While ActiveLaneMask is a binary op that consumes the loop tripcount
      // as a second argument, we only pass the IV here; the tripcount is
      // extracted from the transform state when the VP instructions are
      // code-generated.
7347       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
7348     } else {
7349       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
7350     }
7351     return BlockMaskCache[BB] = BlockMask;
7352   }
7353 
7354   // This is the block mask. We OR all incoming edges.
7355   for (auto *Predecessor : predecessors(BB)) {
7356     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7357     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7358       return BlockMaskCache[BB] = EdgeMask;
7359 
    if (!BlockMask) { // BlockMask has its initial nullptr value.
7361       BlockMask = EdgeMask;
7362       continue;
7363     }
7364 
7365     BlockMask = Builder.createOr(BlockMask, EdgeMask);
7366   }
7367 
7368   return BlockMaskCache[BB] = BlockMask;
7369 }
7370 
7371 VPWidenMemoryInstructionRecipe *
7372 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7373                                   VPlanPtr &Plan) {
7374   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7375          "Must be called with either a load or store");
7376 
7377   auto willWiden = [&](ElementCount VF) -> bool {
7378     assert(!VF.isScalable() && "unexpected scalable ElementCount");
7379     if (VF.isScalar())
7380       return false;
7381     LoopVectorizationCostModel::InstWidening Decision =
7382         CM.getWideningDecision(I, VF);
7383     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7384            "CM decision should be taken at this point.");
7385     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7386       return true;
7387     if (CM.isScalarAfterVectorization(I, VF) ||
7388         CM.isProfitableToScalarize(I, VF))
7389       return false;
7390     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7391   };
7392 
7393   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7394     return nullptr;
7395 
7396   VPValue *Mask = nullptr;
7397   if (Legal->isMaskRequired(I))
7398     Mask = createBlockInMask(I->getParent(), Plan);
7399 
7400   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
7401   if (LoadInst *Load = dyn_cast<LoadInst>(I))
7402     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
7403 
7404   StoreInst *Store = cast<StoreInst>(I);
7405   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
7406   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
7407 }
7408 
7409 VPWidenIntOrFpInductionRecipe *
7410 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
7411   // Check if this is an integer or fp induction. If so, build the recipe that
7412   // produces its scalar and vector values.
7413   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
7414   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
7415       II.getKind() == InductionDescriptor::IK_FpInduction)
7416     return new VPWidenIntOrFpInductionRecipe(Phi);
7417 
7418   return nullptr;
7419 }
7420 
7421 VPWidenIntOrFpInductionRecipe *
7422 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
7423                                                 VFRange &Range) const {
7424   // Optimize the special case where the source is a constant integer
7425   // induction variable. Notice that we can only optimize the 'trunc' case
7426   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7427   // (c) other casts depend on pointer size.
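  // For example (illustrative IR): %t = trunc i64 %iv to i32, where %iv is an
  // optimizable induction, can be handled by widening the induction directly
  // at the narrower type rather than widening the trunc itself.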
7428 
7429   // Determine whether \p K is a truncation based on an induction variable that
7430   // can be optimized.
7431   auto isOptimizableIVTruncate =
7432       [&](Instruction *K) -> std::function<bool(ElementCount)> {
7433     return [=](ElementCount VF) -> bool {
7434       return CM.isOptimizableIVTruncate(K, VF);
7435     };
7436   };
7437 
7438   if (LoopVectorizationPlanner::getDecisionAndClampRange(
7439           isOptimizableIVTruncate(I), Range))
7440     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
7441                                              I);
7442   return nullptr;
7443 }
7444 
7445 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
7446   // We know that all PHIs in non-header blocks are converted into selects, so
7447   // we don't have to worry about the insertion order and we can just use the
7448   // builder. At this point we generate the predication tree. There may be
7449   // duplications since this is a simple recursive scan, but future
7450   // optimizations will clean it up.
7451 
7452   SmallVector<VPValue *, 2> Operands;
7453   unsigned NumIncoming = Phi->getNumIncomingValues();
7454   for (unsigned In = 0; In < NumIncoming; In++) {
7455     VPValue *EdgeMask =
7456       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
7457     assert((EdgeMask || NumIncoming == 1) &&
7458            "Multiple predecessors with one having a full mask");
7459     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
7460     if (EdgeMask)
7461       Operands.push_back(EdgeMask);
7462   }
7463   return new VPBlendRecipe(Phi, Operands);
7464 }
7465 
7466 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
7467                                                    VPlan &Plan) const {
7468 
7469   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7470       [this, CI](ElementCount VF) {
7471         return CM.isScalarWithPredication(CI, VF);
7472       },
7473       Range);
7474 
7475   if (IsPredicated)
7476     return nullptr;
7477 
7478   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7479   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7480              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
7481              ID == Intrinsic::pseudoprobe))
7482     return nullptr;
7483 
7484   auto willWiden = [&](ElementCount VF) -> bool {
7485     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // NeedToScalarize reports whether the vectorized form would have to be
    // scalarized; UseVectorIntrinsic records whether an intrinsic call is
    // at most as expensive as a vector library call.
7490     bool NeedToScalarize = false;
7491     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
7492     bool UseVectorIntrinsic =
7493         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
7494     return UseVectorIntrinsic || !NeedToScalarize;
7495   };
7496 
7497   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7498     return nullptr;
7499 
7500   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
7501 }
7502 
7503 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7504   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7505          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // The instruction should be widened, unless it is scalar after
  // vectorization, scalarization is profitable, or it is predicated.
7508   auto WillScalarize = [this, I](ElementCount VF) -> bool {
7509     return CM.isScalarAfterVectorization(I, VF) ||
7510            CM.isProfitableToScalarize(I, VF) ||
7511            CM.isScalarWithPredication(I, VF);
7512   };
7513   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
7514                                                              Range);
7515 }
7516 
7517 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
7518   auto IsVectorizableOpcode = [](unsigned Opcode) {
7519     switch (Opcode) {
7520     case Instruction::Add:
7521     case Instruction::And:
7522     case Instruction::AShr:
7523     case Instruction::BitCast:
7524     case Instruction::FAdd:
7525     case Instruction::FCmp:
7526     case Instruction::FDiv:
7527     case Instruction::FMul:
7528     case Instruction::FNeg:
7529     case Instruction::FPExt:
7530     case Instruction::FPToSI:
7531     case Instruction::FPToUI:
7532     case Instruction::FPTrunc:
7533     case Instruction::FRem:
7534     case Instruction::FSub:
7535     case Instruction::ICmp:
7536     case Instruction::IntToPtr:
7537     case Instruction::LShr:
7538     case Instruction::Mul:
7539     case Instruction::Or:
7540     case Instruction::PtrToInt:
7541     case Instruction::SDiv:
7542     case Instruction::Select:
7543     case Instruction::SExt:
7544     case Instruction::Shl:
7545     case Instruction::SIToFP:
7546     case Instruction::SRem:
7547     case Instruction::Sub:
7548     case Instruction::Trunc:
7549     case Instruction::UDiv:
7550     case Instruction::UIToFP:
7551     case Instruction::URem:
7552     case Instruction::Xor:
7553     case Instruction::ZExt:
7554       return true;
7555     }
7556     return false;
7557   };
7558 
7559   if (!IsVectorizableOpcode(I->getOpcode()))
7560     return nullptr;
7561 
7562   // Success: widen this instruction.
7563   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
7564 }
7565 
7566 VPBasicBlock *VPRecipeBuilder::handleReplication(
7567     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7568     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7569     VPlanPtr &Plan) {
7570   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7571       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
7572       Range);
7573 
7574   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7575       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
7576       Range);
7577 
7578   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
7579                                        IsUniform, IsPredicated);
7580   setRecipe(I, Recipe);
7581   Plan->addVPValue(I, Recipe);
7582 
7583   // Find if I uses a predicated instruction. If so, it will use its scalar
7584   // value. Avoid hoisting the insert-element which packs the scalar value into
7585   // a vector value, as that happens iff all users use the vector value.
7586   for (auto &Op : I->operands())
7587     if (auto *PredInst = dyn_cast<Instruction>(Op))
7588       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7589         PredInst2Recipe[PredInst]->setAlsoPack(false);
7590 
  // Finalize the recipe for Instr, handling the non-predicated case first.
7592   if (!IsPredicated) {
7593     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7594     VPBB->appendRecipe(Recipe);
7595     return VPBB;
7596   }
7597   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7598   assert(VPBB->getSuccessors().empty() &&
7599          "VPBB has successors when handling predicated replication.");
7600   // Record predicated instructions for above packing optimizations.
7601   PredInst2Recipe[I] = Recipe;
7602   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7603   VPBlockUtils::insertBlockAfter(Region, VPBB);
7604   auto *RegSucc = new VPBasicBlock();
7605   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7606   return RegSucc;
7607 }
7608 
7609 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7610                                                       VPRecipeBase *PredRecipe,
7611                                                       VPlanPtr &Plan) {
7612   // Instructions marked for predication are replicated and placed under an
7613   // if-then construct to prevent side-effects.
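  // For example, a predicated store yields a "pred.store" region made up of
  // pred.store.entry, pred.store.if and pred.store.continue blocks, following
  // the RegionName construction below.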
7614 
7615   // Generate recipes to compute the block mask for this region.
7616   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7617 
7618   // Build the triangular if-then region.
7619   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7620   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7621   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7622   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7623   auto *PHIRecipe = Instr->getType()->isVoidTy()
7624                         ? nullptr
7625                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
7626   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7627   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7628   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7629 
7630   // Note: first set Entry as region entry and then connect successors starting
7631   // from it in order, to propagate the "parent" of each VPBasicBlock.
7632   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7633   VPBlockUtils::connectBlocks(Pred, Exit);
7634 
7635   return Region;
7636 }
7637 
7638 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
7639                                                       VFRange &Range,
7640                                                       VPlanPtr &Plan) {
7641   // First, check for specific widening recipes that deal with calls, memory
7642   // operations, inductions and Phi nodes.
7643   if (auto *CI = dyn_cast<CallInst>(Instr))
7644     return tryToWidenCall(CI, Range, *Plan);
7645 
7646   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
7647     return tryToWidenMemory(Instr, Range, Plan);
7648 
7649   VPRecipeBase *Recipe;
7650   if (auto Phi = dyn_cast<PHINode>(Instr)) {
7651     if (Phi->getParent() != OrigLoop->getHeader())
7652       return tryToBlend(Phi, Plan);
7653     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
7654       return Recipe;
7655     return new VPWidenPHIRecipe(Phi);
7656   }
7657 
7658   if (isa<TruncInst>(Instr) &&
7659       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
7660     return Recipe;
7661 
7662   if (!shouldWiden(Instr, Range))
7663     return nullptr;
7664 
7665   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
7666     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
7667                                 OrigLoop);
7668 
7669   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
7670     bool InvariantCond =
7671         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
7672     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
7673                                    InvariantCond);
7674   }
7675 
7676   return tryToWiden(Instr, *Plan);
7677 }
7678 
7679 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
7680                                                         ElementCount MaxVF) {
7681   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7682 
7683   // Collect instructions from the original loop that will become trivially dead
7684   // in the vectorized loop. We don't need to vectorize these instructions. For
7685   // example, original induction update instructions can become dead because we
7686   // separately emit induction "steps" when generating code for the new loop.
7687   // Similarly, we create a new latch condition when setting up the structure
7688   // of the new loop, so the old one can become dead.
7689   SmallPtrSet<Instruction *, 4> DeadInstructions;
7690   collectTriviallyDeadInstructions(DeadInstructions);
7691 
7692   // Add assume instructions we need to drop to DeadInstructions, to prevent
7693   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
7695   // control flow is preserved, we should keep them.
7696   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7697   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7698 
7699   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7700   // Dead instructions do not need sinking. Remove them from SinkAfter.
7701   for (Instruction *I : DeadInstructions)
7702     SinkAfter.erase(I);
7703 
7704   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7705   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7706     VFRange SubRange = {VF, MaxVFPlusOne};
7707     VPlans.push_back(
7708         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
7709     VF = SubRange.End;
7710   }
7711 }
7712 
7713 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7714     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
7715     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7716 
7717   // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
7720   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7721 
7722   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7723 
7724   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7725 
7726   // ---------------------------------------------------------------------------
7727   // Pre-construction: record ingredients whose recipes we'll need to further
7728   // process after constructing the initial VPlan.
7729   // ---------------------------------------------------------------------------
7730 
7731   // Mark instructions we'll need to sink later and their targets as
7732   // ingredients whose recipe we'll need to record.
7733   for (auto &Entry : SinkAfter) {
7734     RecipeBuilder.recordRecipeOf(Entry.first);
7735     RecipeBuilder.recordRecipeOf(Entry.second);
7736   }
7737   for (auto &Reduction : CM.getInLoopReductionChains()) {
7738     PHINode *Phi = Reduction.first;
7739     RecurrenceDescriptor::RecurrenceKind Kind =
7740         Legal->getReductionVars()[Phi].getRecurrenceKind();
7741     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7742 
7743     RecipeBuilder.recordRecipeOf(Phi);
7744     for (auto &R : ReductionOperations) {
7745       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
7747       // need to record the ICmp recipe, so it can be removed later.
7748       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7749           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7750         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
7751       }
7752     }
7753   }
7754 
7755   // For each interleave group which is relevant for this (possibly trimmed)
7756   // Range, add it to the set of groups to be later applied to the VPlan and add
7757   // placeholders for its members' Recipes which we'll be replacing with a
7758   // single VPInterleaveRecipe.
7759   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7760     auto applyIG = [IG, this](ElementCount VF) -> bool {
7761       return (VF.isVector() && // Query is illegal for VF == 1
7762               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7763                   LoopVectorizationCostModel::CM_Interleave);
7764     };
7765     if (!getDecisionAndClampRange(applyIG, Range))
7766       continue;
7767     InterleaveGroups.insert(IG);
7768     for (unsigned i = 0; i < IG->getFactor(); i++)
7769       if (Instruction *Member = IG->getMember(i))
7770         RecipeBuilder.recordRecipeOf(Member);
7771   };
7772 
7773   // ---------------------------------------------------------------------------
7774   // Build initial VPlan: Scan the body of the loop in a topological order to
7775   // visit each basic block after having visited its predecessor basic blocks.
7776   // ---------------------------------------------------------------------------
7777 
7778   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7779   auto Plan = std::make_unique<VPlan>();
7780   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7781   Plan->setEntry(VPBB);
7782 
7783   // Scan the body of the loop in a topological order to visit each basic block
7784   // after having visited its predecessor basic blocks.
7785   LoopBlocksDFS DFS(OrigLoop);
7786   DFS.perform(LI);
7787 
7788   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7789     // Relevant instructions from basic block BB will be grouped into VPRecipe
7790     // ingredients and fill a new VPBasicBlock.
7791     unsigned VPBBsForBB = 0;
7792     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7793     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7794     VPBB = FirstVPBBForBB;
7795     Builder.setInsertPoint(VPBB);
7796 
7797     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
7799     for (Instruction &I : BB->instructionsWithoutDebug()) {
7800       Instruction *Instr = &I;
7801 
7802       // First filter out irrelevant instructions, to ensure no recipes are
7803       // built for them.
7804       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
7805         continue;
7806 
7807       if (auto Recipe =
7808               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7809         // Check if the recipe can be converted to a VPValue. We need the extra
7810         // down-casting step until VPRecipeBase inherits from VPValue.
7811         VPValue *MaybeVPValue = Recipe->toVPValue();
7812         if (!Instr->getType()->isVoidTy() && MaybeVPValue)
7813           Plan->addVPValue(Instr, MaybeVPValue);
7814 
7815         RecipeBuilder.setRecipe(Instr, Recipe);
7816         VPBB->appendRecipe(Recipe);
7817         continue;
7818       }
7819 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
7822       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7823           Instr, Range, VPBB, PredInst2Recipe, Plan);
7824       if (NextVPBB != VPBB) {
7825         VPBB = NextVPBB;
7826         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7827                                     : "");
7828       }
7829     }
7830   }
7831 
  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one, VPBB, reflecting original
  // basic blocks with no recipes.
7835   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7836   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7837   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7838   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7839   delete PreEntry;
7840 
7841   // ---------------------------------------------------------------------------
7842   // Transform initial VPlan: Apply previously taken decisions, in order, to
7843   // bring the VPlan to its final state.
7844   // ---------------------------------------------------------------------------
7845 
7846   // Apply Sink-After legal constraints.
7847   for (auto &Entry : SinkAfter) {
7848     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7849     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7850     Sink->moveAfter(Target);
7851   }
7852 
7853   // Interleave memory: for each Interleave Group we marked earlier as relevant
7854   // for this VPlan, replace the Recipes widening its memory instructions with a
7855   // single VPInterleaveRecipe at its insertion point.
7856   for (auto IG : InterleaveGroups) {
7857     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7858         RecipeBuilder.getRecipe(IG->getInsertPos()));
7859     SmallVector<VPValue *, 4> StoredValues;
7860     for (unsigned i = 0; i < IG->getFactor(); ++i)
7861       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
7862         StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
7863 
7864     (new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
7865                             Recipe->getMask()))
7866         ->insertBefore(Recipe);
7867 
7868     for (unsigned i = 0; i < IG->getFactor(); ++i)
7869       if (Instruction *Member = IG->getMember(i)) {
7870         if (!Member->getType()->isVoidTy()) {
7871           VPValue *OriginalV = Plan->getVPValue(Member);
7872           Plan->removeVPValueFor(Member);
7873           OriginalV->replaceAllUsesWith(Plan->getOrAddVPValue(Member));
7874         }
7875         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7876       }
7877   }
7878 
7879   // Adjust the recipes for any inloop reductions.
7880   if (Range.Start.isVector())
7881     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
7882 
7883   // Finally, if tail is folded by masking, introduce selects between the phi
7884   // and the live-out instruction of each reduction, at the end of the latch.
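  // Conceptually, each such reduction gets something along the lines of
  //   reduced = select(header-mask, live-out, phi)
  // (pseudo-notation for the select created below via createNaryOp).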
7885   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
7886     Builder.setInsertPoint(VPBB);
7887     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7888     for (auto &Reduction : Legal->getReductionVars()) {
7889       if (CM.isInLoopReduction(Reduction.first))
7890         continue;
7891       VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
7892       VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
7893       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7894     }
7895   }
7896 
7897   std::string PlanName;
7898   raw_string_ostream RSO(PlanName);
7899   ElementCount VF = Range.Start;
7900   Plan->addVF(VF);
7901   RSO << "Initial VPlan for VF={" << VF;
7902   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
7903     Plan->addVF(VF);
7904     RSO << "," << VF;
7905   }
7906   RSO << "},UF>=1";
7907   RSO.flush();
7908   Plan->setName(PlanName);
7909 
7910   return Plan;
7911 }
7912 
7913 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
7916   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7917   // the vectorization pipeline.
7918   assert(!OrigLoop->isInnermost());
7919   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7920 
7921   // Create new empty VPlan
7922   auto Plan = std::make_unique<VPlan>();
7923 
7924   // Build hierarchical CFG
7925   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7926   HCFGBuilder.buildHierarchicalCFG();
7927 
7928   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
7929        VF *= 2)
7930     Plan->addVF(VF);
7931 
7932   if (EnableVPlanPredication) {
7933     VPlanPredicator VPP(*Plan);
7934     VPP.predicate();
7935 
7936     // Avoid running transformation to recipes until masked code generation in
7937     // VPlan-native path is in place.
7938     return Plan;
7939   }
7940 
7941   SmallPtrSet<Instruction *, 1> DeadInstructions;
7942   VPlanTransforms::VPInstructionsToVPRecipes(
7943       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7944   return Plan;
7945 }
7946 
7947 // Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi needs to be converted to
7949 // reductions, with one operand being vector and the other being the scalar
7950 // reduction chain.
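// As a rough illustration (IR names invented): for an in-loop add reduction
// computing %sum.next = add %sum, %x, the widened add is replaced by a
// VPReductionRecipe that reduces the vector of %x values and accumulates the
// scalar result into the chain that stands for %sum.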
7951 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
7952     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
7953   for (auto &Reduction : CM.getInLoopReductionChains()) {
7954     PHINode *Phi = Reduction.first;
7955     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
7956     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7957 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
7960     // which of the two operands will remain scalar and which will be reduced.
7961     // For minmax the chain will be the select instructions.
7962     Instruction *Chain = Phi;
7963     for (Instruction *R : ReductionOperations) {
7964       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
7965       RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
7966 
7967       VPValue *ChainOp = Plan->getVPValue(Chain);
7968       unsigned FirstOpId;
7969       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7970           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7971         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
7972                "Expected to replace a VPWidenSelectSC");
7973         FirstOpId = 1;
7974       } else {
7975         assert(isa<VPWidenRecipe>(WidenRecipe) &&
7976                "Expected to replace a VPWidenSC");
7977         FirstOpId = 0;
7978       }
7979       unsigned VecOpId =
7980           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
7981       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
7982 
7983       auto *CondOp = CM.foldTailByMasking()
7984                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
7985                          : nullptr;
7986       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
7987           &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
7988       WidenRecipe->toVPValue()->replaceAllUsesWith(RedRecipe);
7989       Plan->removeVPValueFor(R);
7990       Plan->addVPValue(R, RedRecipe);
7991       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
7992       WidenRecipe->eraseFromParent();
7993 
7994       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7995           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7996         VPRecipeBase *CompareRecipe =
7997             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
7998         assert(isa<VPWidenRecipe>(CompareRecipe) &&
7999                "Expected to replace a VPWidenSC");
8000         assert(CompareRecipe->toVPValue()->getNumUsers() == 0 &&
8001                "Expected no remaining users");
8002         CompareRecipe->eraseFromParent();
8003       }
8004       Chain = R;
8005     }
8006   }
8007 }
8008 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
8013 
8014 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
8015     Value *V, const VPIteration &Instance) {
8016   return ILV.getOrCreateScalarValue(V, Instance);
8017 }
8018 
8019 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
8020                                VPSlotTracker &SlotTracker) const {
8021   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
8022   IG->getInsertPos()->printAsOperand(O, false);
8023   O << ", ";
8024   getAddr()->printAsOperand(O, SlotTracker);
8025   VPValue *Mask = getMask();
8026   if (Mask) {
8027     O << ", ";
8028     Mask->printAsOperand(O, SlotTracker);
8029   }
8030   for (unsigned i = 0; i < IG->getFactor(); ++i)
8031     if (Instruction *I = IG->getMember(i))
8032       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
8033 }
8034 
8035 void VPWidenCallRecipe::execute(VPTransformState &State) {
8036   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
8037                                   *this, State);
8038 }
8039 
8040 void VPWidenSelectRecipe::execute(VPTransformState &State) {
8041   State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
8042                                     this, *this, InvariantCond, State);
8043 }
8044 
8045 void VPWidenRecipe::execute(VPTransformState &State) {
8046   State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
8047 }
8048 
8049 void VPWidenGEPRecipe::execute(VPTransformState &State) {
8050   State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
8051                       *this, State.UF, State.VF, IsPtrLoopInvariant,
8052                       IsIndexLoopInvariant, State);
8053 }
8054 
8055 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
8056   assert(!State.Instance && "Int or FP induction being replicated.");
8057   State.ILV->widenIntOrFpInduction(IV, Trunc);
8058 }
8059 
8060 void VPWidenPHIRecipe::execute(VPTransformState &State) {
8061   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
8062 }
8063 
8064 void VPBlendRecipe::execute(VPTransformState &State) {
8065   State.ILV->setDebugLocFromInst(State.Builder, Phi);
8066   // We know that all PHIs in non-header blocks are converted into
8067   // selects, so we don't have to worry about the insertion order and we
8068   // can just use the builder.
8069   // At this point we generate the predication tree. There may be
8070   // duplications since this is a simple recursive scan, but future
8071   // optimizations will clean it up.
8072 
8073   unsigned NumIncoming = getNumIncomingValues();
8074 
8075   // Generate a sequence of selects of the form:
8076   // SELECT(Mask3, In3,
8077   //        SELECT(Mask2, In2,
8078   //               SELECT(Mask1, In1,
8079   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // are essentially undef and are taken from In0.
8082   InnerLoopVectorizer::VectorParts Entry(State.UF);
8083   for (unsigned In = 0; In < NumIncoming; ++In) {
8084     for (unsigned Part = 0; Part < State.UF; ++Part) {
8085       // We might have single edge PHIs (blocks) - use an identity
8086       // 'select' for the first PHI operand.
8087       Value *In0 = State.get(getIncomingValue(In), Part);
8088       if (In == 0)
8089         Entry[Part] = In0; // Initialize with the first incoming value.
8090       else {
8091         // Select between the current value and the previous incoming edge
8092         // based on the incoming mask.
8093         Value *Cond = State.get(getMask(In), Part);
8094         Entry[Part] =
8095             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8096       }
8097     }
8098   }
8099   for (unsigned Part = 0; Part < State.UF; ++Part)
8100     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
8101 }
8102 
8103 void VPInterleaveRecipe::execute(VPTransformState &State) {
8104   assert(!State.Instance && "Interleave group being replicated.");
8105   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getStoredValues(),
8106                                       getMask());
8107 }
8108 
8109 void VPReductionRecipe::execute(VPTransformState &State) {
8110   assert(!State.Instance && "Reduction being replicated.");
8111   for (unsigned Part = 0; Part < State.UF; ++Part) {
8112     RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc->getRecurrenceKind();
8113     Value *NewVecOp = State.get(getVecOp(), Part);
8114     if (VPValue *Cond = getCondOp()) {
8115       Value *NewCond = State.get(Cond, Part);
8116       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
8117       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
8118           Kind, RdxDesc->getMinMaxRecurrenceKind(), VecTy->getElementType());
8119       Constant *IdenVec =
8120           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
8121       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
8122       NewVecOp = Select;
8123     }
8124     Value *NewRed =
8125         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
8126     Value *PrevInChain = State.get(getChainOp(), Part);
8127     Value *NextInChain;
8128     if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8129         Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8130       NextInChain =
8131           createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
8132                          NewRed, PrevInChain);
8133     } else {
8134       NextInChain = State.Builder.CreateBinOp(
8135           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
8136           PrevInChain);
8137     }
8138     State.set(this, getUnderlyingInstr(), NextInChain, Part);
8139   }
8140 }
8141 
8142 void VPReplicateRecipe::execute(VPTransformState &State) {
8143   if (State.Instance) { // Generate a single instance.
8144     State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
8145                                     *State.Instance, IsPredicated, State);
8146     // Insert scalar instance packing it into a vector.
8147     if (AlsoPack && State.VF.isVector()) {
8148       // If we're constructing lane 0, initialize to start from undef.
8149       if (State.Instance->Lane == 0) {
8150         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8151         Value *Undef = UndefValue::get(
8152             VectorType::get(getUnderlyingValue()->getType(), State.VF));
8153         State.ValueMap.setVectorValue(getUnderlyingInstr(),
8154                                       State.Instance->Part, Undef);
8155       }
8156       State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
8157                                            *State.Instance);
8158     }
8159     return;
8160   }
8161 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
8165   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8166   for (unsigned Part = 0; Part < State.UF; ++Part)
8167     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8168       State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
8169                                       IsPredicated, State);
8170 }
8171 
8172 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8173   assert(State.Instance && "Branch on Mask works only on single instance.");
8174 
8175   unsigned Part = State.Instance->Part;
8176   unsigned Lane = State.Instance->Lane;
8177 
8178   Value *ConditionBit = nullptr;
8179   VPValue *BlockInMask = getMask();
8180   if (BlockInMask) {
8181     ConditionBit = State.get(BlockInMask, Part);
8182     if (ConditionBit->getType()->isVectorTy())
8183       ConditionBit = State.Builder.CreateExtractElement(
8184           ConditionBit, State.Builder.getInt32(Lane));
8185   } else // Block in mask is all-one.
8186     ConditionBit = State.Builder.getTrue();
8187 
8188   // Replace the temporary unreachable terminator with a new conditional branch,
8189   // whose two destinations will be set later when they are created.
8190   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8191   assert(isa<UnreachableInst>(CurrentTerminator) &&
8192          "Expected to replace unreachable terminator with conditional branch.");
8193   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8194   CondBr->setSuccessor(0, nullptr);
8195   ReplaceInstWithInst(CurrentTerminator, CondBr);
8196 }
8197 
8198 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8199   assert(State.Instance && "Predicated instruction PHI works per instance.");
8200   Instruction *ScalarPredInst =
8201       cast<Instruction>(State.get(getOperand(0), *State.Instance));
8202   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8203   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8204   assert(PredicatingBB && "Predicated block has no single predecessor.");
8205 
8206   // By current pack/unpack logic we need to generate only a single phi node: if
8207   // a vector value for the predicated instruction exists at this point it means
8208   // the instruction has vector users only, and a phi for the vector value is
8209   // needed. In this case the recipe of the predicated instruction is marked to
8210   // also do that packing, thereby "hoisting" the insert-element sequence.
8211   // Otherwise, a phi node for the scalar value is needed.
8212   unsigned Part = State.Instance->Part;
8213   Instruction *PredInst =
8214       cast<Instruction>(getOperand(0)->getUnderlyingValue());
8215   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8216     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8217     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8218     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8219     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8220     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8221     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8222   } else {
8223     Type *PredInstType = PredInst->getType();
8224     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8225     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8226     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8227     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8228   }
8229 }
8230 
8231 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8232   Instruction *Instr = getUnderlyingInstr();
8233   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
8234   State.ILV->vectorizeMemoryInstruction(Instr, State,
8235                                         StoredValue ? nullptr : this, getAddr(),
8236                                         StoredValue, getMask());
8237 }
8238 
8239 // Determine how to lower the scalar epilogue, which depends on 1) optimising
8240 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8241 // predication, and 4) a TTI hook that analyses whether the loop is suitable
8242 // for predication.
8243 static ScalarEpilogueLowering getScalarEpilogueLowering(
8244     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8245     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8246     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8247     LoopVectorizationLegality &LVL) {
8248   // 1) OptSize takes precedence over all other options, i.e. if this is set,
8249   // don't look at hints or options, and don't request a scalar epilogue.
8250   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8251   // LoopAccessInfo (due to code dependency and not being able to reliably get
8252   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8253   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
8254   // versioning when the vectorization is forced, unlike hasOptSize. So revert
8255   // back to the old way and vectorize with versioning when forced. See D81345.)
8256   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8257                                                       PGSOQueryType::IRPass) &&
8258                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8259     return CM_ScalarEpilogueNotAllowedOptSize;
8260 
8261   bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() &&
8262                               !PreferPredicateOverEpilogue;
8263 
8264   // 2) Next, if disabling predication is requested on the command line, honour
8265   // this and request a scalar epilogue.
8266   if (PredicateOptDisabled)
8267     return CM_ScalarEpilogueAllowed;
8268 
8269   // 3) and 4) look if enabling predication is requested on the command line,
8270   // with a loop hint, or if the TTI hook indicates this is profitable, request
8271   // predication.
8272   if (PreferPredicateOverEpilogue ||
8273       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
8274       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8275                                         LVL.getLAI()) &&
8276        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
8277     return CM_ScalarEpilogueNotNeededUsePredicate;
8278 
8279   return CM_ScalarEpilogueAllowed;
8280 }
8281 
8282 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
8283                            unsigned Part) {
8284   set(Def, V, Part);
8285   ILV->setVectorValue(IRDef, Part, V);
8286 }
8287 
8288 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
8290 // VPlan-to-VPlan transformations from the very beginning without modifying the
8291 // input LLVM IR.
8292 static bool processLoopInVPlanNativePath(
8293     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8294     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8295     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8296     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8297     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8298 
8299   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
8300     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8301     return false;
8302   }
8303   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8304   Function *F = L->getHeader()->getParent();
8305   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8306 
8307   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8308       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8309 
8310   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8311                                 &Hints, IAI);
8312   // Use the planner for outer loop vectorization.
8313   // TODO: CM is not used at this point inside the planner. Turn CM into an
8314   // optional argument if we don't need it in the future.
8315   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8316 
8317   // Get user vectorization factor.
8318   const unsigned UserVF = Hints.getWidth();
8319 
8320   // Plan how to best vectorize, return the best VF and its cost.
8321   const VectorizationFactor VF =
8322       LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF));
8323 
8324   // If we are stress testing VPlan builds, do not attempt to generate vector
8325   // code. Masked vector code generation support will follow soon.
8326   // Also, do not attempt to vectorize if no vector code will be produced.
8327   if (VPlanBuildStressTest || EnableVPlanPredication ||
8328       VectorizationFactor::Disabled() == VF)
8329     return false;
8330 
8331   LVP.setBestPlan(VF.Width, 1);
8332 
8333   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
8334                          &CM, BFI, PSI);
8335   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8336                     << L->getHeader()->getParent()->getName() << "\"\n");
8337   LVP.executePlan(LB, DT);
8338 
8339   // Mark the loop as already vectorized to avoid vectorizing again.
8340   Hints.setAlreadyVectorized();
8341 
8342   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8343   return true;
8344 }
8345 
8346 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8347     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8348                                !EnableLoopInterleaving),
8349       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8350                               !EnableLoopVectorization) {}
8351 
8352 bool LoopVectorizePass::processLoop(Loop *L) {
8353   assert((EnableVPlanNativePath || L->isInnermost()) &&
8354          "VPlan-native path is not enabled. Only process inner loops.");
8355 
8356 #ifndef NDEBUG
8357   const std::string DebugLocStr = getDebugLocString(L);
8358 #endif /* NDEBUG */
8359 
8360   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
8361                     << L->getHeader()->getParent()->getName() << "\" from "
8362                     << DebugLocStr << "\n");
8363 
8364   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
8365 
8366   LLVM_DEBUG(
8367       dbgs() << "LV: Loop hints:"
8368              << " force="
8369              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8370                      ? "disabled"
8371                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8372                             ? "enabled"
8373                             : "?"))
8374              << " width=" << Hints.getWidth()
8375              << " unroll=" << Hints.getInterleave() << "\n");
8376 
8377   // Function containing loop
8378   Function *F = L->getHeader()->getParent();
8379 
8380   // Looking at the diagnostic output is the only way to determine if a loop
8381   // was vectorized (other than looking at the IR or machine code), so it
8382   // is important to generate an optimization remark for each loop. Most of
8383   // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose, reporting vectorized loops and unvectorized loops that may
8386   // benefit from vectorization, respectively.
8387 
8388   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
8389     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
8390     return false;
8391   }
8392 
8393   PredicatedScalarEvolution PSE(*SE, *L);
8394 
8395   // Check if it is legal to vectorize the loop.
8396   LoopVectorizationRequirements Requirements(*ORE);
8397   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
8398                                 &Requirements, &Hints, DB, AC, BFI, PSI);
8399   if (!LVL.canVectorize(EnableVPlanNativePath)) {
8400     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
8401     Hints.emitRemarkWithHints();
8402     return false;
8403   }
8404 
8405   // Check the function attributes and profiles to find out if this function
8406   // should be optimized for size.
8407   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8408       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
8409 
8410   // Entrance to the VPlan-native vectorization path. Outer loops are processed
8411   // here. They may require CFG and instruction level transformations before
8412   // even evaluating whether vectorization is profitable. Since we cannot modify
8413   // the incoming IR, we need to build VPlan upfront in the vectorization
8414   // pipeline.
8415   if (!L->isInnermost())
8416     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
8417                                         ORE, BFI, PSI, Hints);
8418 
8419   assert(L->isInnermost() && "Inner loop expected.");
8420 
8421   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
8422   // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check does not seem correct -- what if the loop is an integer
  // loop and the vector instructions selected are purely integer vector
  // instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
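  // For illustration: an interleave group covers strided accesses that can be
  // combined into wide loads/stores plus shuffles, e.g. the two loads in
  //   for (int i = 0; i < n; ++i) sum += a[2 * i] + a[2 * i + 1];
  // form a group with interleave factor 2 (sketch only; the real grouping
  // rules live in InterleavedAccessInfo).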
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get user vectorization factor and interleave count.
  unsigned UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();
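  // Note (editor's sketch): a value of 0 for either hint means the user did
  // not request a specific width or interleave count, in which case the
  // planner and cost model below choose the values themselves.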

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF =
      LVP.plan(ElementCount::getFixed(UserVF), UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided not to vectorize the loop (the cost model chose a scalar
    // VF), then interleave it instead.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
                               BFI, PSI);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is worthwhile to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM, BFI, PSI);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling a scalar loop when there are
    // no runtime checks about strides and memory. A scalar loop that is
    // rarely used is not worth unrolling.
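    // (Concretely, this is done by attaching loop metadata along the lines of
    //   !{!"llvm.loop.unroll.runtime.disable"}
    // to the remaining scalar loop; see AddRuntimeUnrollDisableMetaData below
    // for the exact form.)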
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

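  // Propagate any user-provided follow-up metadata to the remainder (epilogue)
  // loop. For illustration, the string constants used below correspond to
  // metadata names such as "llvm.loop.vectorize.followup_all" and
  // "llvm.loop.vectorize.followup_epilogue" (see the loop transformation
  // metadata documentation for the authoritative spelling).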
  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
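  // For illustration: "scalar interleaving" here means unrolling the scalar
  // loop with interleaved iterations, e.g. turning
  //   for (i = 0; i < n; ++i) sum += a[i];
  // into a form that accumulates into independent partial sums, which can
  // improve ILP even without vector registers (sketch only).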
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
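  // (Loop-simplify form requires, roughly, a dedicated preheader, a single
  // backedge, and dedicated exit blocks; see simplifyLoop for the precise
  // definition.)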
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
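    // (In LCSSA form, every value defined inside the loop and used outside it
    // is routed through a PHI node in the exit block, e.g. a reduction result
    // is read outside the loop via such a PHI rather than directly.)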
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}