1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
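//
// For example, given the scalar loop
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
// a vectorization factor (VF) of 4 conceptually turns each vector iteration
// into a[i..i+3] = b[i..i+3] + 1, using 4-wide loads, adds and stores and
// advancing i by 4; iterations that do not fill a whole vector are handled by
// a scalar epilogue loop or by predication.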
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is an ongoing development effort to migrate the loop vectorizer to
// the VPlan infrastructure and to introduce outer loop vectorization support
// (see docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For
// this purpose, we temporarily introduced the VPlan-native vectorization
// path: an alternative vectorization path that is natively implemented on top
// of the VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 /// @{
161 /// Metadata attribute names
162 static const char *const LLVMLoopVectorizeFollowupAll =
163     "llvm.loop.vectorize.followup_all";
164 static const char *const LLVMLoopVectorizeFollowupVectorized =
165     "llvm.loop.vectorize.followup_vectorized";
166 static const char *const LLVMLoopVectorizeFollowupEpilogue =
167     "llvm.loop.vectorize.followup_epilogue";
168 /// @}
169 
170 STATISTIC(LoopsVectorized, "Number of loops vectorized");
171 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172 
173 /// Loops with a known constant trip count below this number are vectorized only
174 /// if no scalar iteration overheads are incurred.
175 static cl::opt<unsigned> TinyTripCountVectorThreshold(
176     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
177     cl::desc("Loops with a constant trip count that is smaller than this "
178              "value are vectorized only if no scalar iteration overheads "
179              "are incurred."));
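
// For example (illustrative invocation), the threshold can be adjusted from
// the command line when experimenting:
//   opt -passes=loop-vectorize -vectorizer-min-trip-count=4 -S input.ll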
180 
// The option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred instead: the vectorizer will try
// to fold the tail loop (epilogue) into the vector body and predicate the
// instructions accordingly. If tail-folding fails, the values below select
// different fallback strategies:
186 namespace PreferPredicateTy {
187   enum Option {
188     ScalarEpilogue = 0,
189     PredicateElseScalarEpilogue,
190     PredicateOrDontVectorize
191   };
192 } // namespace PreferPredicateTy
193 
194 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
195     "prefer-predicate-over-epilogue",
196     cl::init(PreferPredicateTy::ScalarEpilogue),
197     cl::Hidden,
198     cl::desc("Tail-folding and predication preferences over creating a scalar "
199              "epilogue loop."),
200     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
201                          "scalar-epilogue",
202                          "Don't tail-predicate loops, create scalar epilogue"),
203               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
204                          "predicate-else-scalar-epilogue",
205                          "prefer tail-folding, create scalar epilogue if tail "
206                          "folding fails."),
207               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
208                          "predicate-dont-vectorize",
209                          "prefers tail-folding, don't attempt vectorization if "
210                          "tail-folding fails.")));
211 
212 static cl::opt<bool> MaximizeBandwidth(
213     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
214     cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));
216 
217 static cl::opt<bool> EnableInterleavedMemAccesses(
218     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
219     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
220 
221 /// An interleave-group may need masking if it resides in a block that needs
222 /// predication, or in order to mask away gaps.
223 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
224     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
225     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
226 
227 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
228     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number."));
231 
232 static cl::opt<unsigned> ForceTargetNumScalarRegs(
233     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
234     cl::desc("A flag that overrides the target's number of scalar registers."));
235 
236 static cl::opt<unsigned> ForceTargetNumVectorRegs(
237     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
238     cl::desc("A flag that overrides the target's number of vector registers."));
239 
240 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
241     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
242     cl::desc("A flag that overrides the target's max interleave factor for "
243              "scalar loops."));
244 
245 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
246     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
247     cl::desc("A flag that overrides the target's max interleave factor for "
248              "vectorized loops."));
249 
250 static cl::opt<unsigned> ForceTargetInstructionCost(
251     "force-target-instruction-cost", cl::init(0), cl::Hidden,
252     cl::desc("A flag that overrides the target's expected cost for "
253              "an instruction to a single constant value. Mostly "
254              "useful for getting consistent testing."));
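
// For example (illustrative invocation), tests can pin the cost model to get
// target-independent, deterministic behavior:
//   opt -passes=loop-vectorize -force-target-instruction-cost=1 \
//       -force-target-max-vector-interleave=1 -S input.ll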
255 
256 static cl::opt<unsigned> SmallLoopCost(
257     "small-loop-cost", cl::init(20), cl::Hidden,
258     cl::desc(
259         "The cost of a loop that is considered 'small' by the interleaver."));
260 
261 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
262     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
263     cl::desc("Enable the use of the block frequency analysis to access PGO "
264              "heuristics minimizing code growth in cold regions and being more "
265              "aggressive in hot regions."));
266 
267 // Runtime interleave loops for load/store throughput.
268 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
269     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
270     cl::desc(
271         "Enable runtime interleaving until load/store ports are saturated"));
272 
273 /// Interleave small loops with scalar reductions.
274 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
275     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
276     cl::desc("Enable interleaving for loops with small iteration counts that "
277              "contain scalar reductions to expose ILP."));
278 
279 /// The number of stores in a loop that are allowed to need predication.
280 static cl::opt<unsigned> NumberOfStoresToPredicate(
281     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
282     cl::desc("Max number of stores to be predicated behind an if."));
283 
284 static cl::opt<bool> EnableIndVarRegisterHeur(
285     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
286     cl::desc("Count the induction variable only once when interleaving"));
287 
288 static cl::opt<bool> EnableCondStoresVectorization(
289     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
290     cl::desc("Enable if predication of stores during vectorization."));
291 
292 static cl::opt<unsigned> MaxNestedScalarReductionIC(
293     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
294     cl::desc("The maximum interleave count to use when interleaving a scalar "
295              "reduction in a nested loop."));
296 
297 static cl::opt<bool>
298     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
299                            cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));
302 
303 static cl::opt<bool> PreferPredicatedReductionSelect(
304     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
305     cl::desc(
        "Prefer predicating a reduction operation over an after-loop select."));
307 
308 cl::opt<bool> EnableVPlanNativePath(
309     "enable-vplan-native-path", cl::init(false), cl::Hidden,
310     cl::desc("Enable VPlan-native vectorization path with "
311              "support for outer loop vectorization."));
312 
313 // FIXME: Remove this switch once we have divergence analysis. Currently we
314 // assume divergent non-backedge branches when this switch is true.
315 cl::opt<bool> EnableVPlanPredication(
316     "enable-vplan-predication", cl::init(false), cl::Hidden,
317     cl::desc("Enable VPlan-native vectorization path predicator with "
318              "support for outer loop vectorization."));
319 
320 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
322 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
323 // verification of the H-CFGs built.
324 static cl::opt<bool> VPlanBuildStressTest(
325     "vplan-build-stress-test", cl::init(false), cl::Hidden,
326     cl::desc(
327         "Build VPlan for every supported loop nest in the function and bail "
328         "out right after the build (stress test the VPlan H-CFG construction "
329         "in the VPlan-native vectorization path)."));
330 
331 cl::opt<bool> llvm::EnableLoopInterleaving(
332     "interleave-loops", cl::init(true), cl::Hidden,
333     cl::desc("Enable loop interleaving in Loop vectorization passes"));
334 cl::opt<bool> llvm::EnableLoopVectorization(
335     "vectorize-loops", cl::init(true), cl::Hidden,
336     cl::desc("Run the Loop vectorization passes"));
337 
/// A helper function that returns the type of the loaded or stored value.
339 static Type *getMemInstValueType(Value *I) {
340   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
341          "Expected Load or Store instruction");
342   if (auto *LI = dyn_cast<LoadInst>(I))
343     return LI->getType();
344   return cast<StoreInst>(I)->getValueOperand()->getType();
345 }
346 
347 /// A helper function that returns true if the given type is irregular. The
348 /// type is irregular if its allocated size doesn't equal the store size of an
349 /// element of the corresponding vector type at the given vectorization factor.
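///
/// For example, under a typical x86-64 data layout x86_fp80 is 80 bits wide
/// but allocated in 16 bytes, so its allocation size and its size differ and
/// the type is treated as irregular even at VF = 1.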
350 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
351   assert(!VF.isScalable() && "scalable vectors not yet supported.");
352   // Determine if an array of VF elements of type Ty is "bitcast compatible"
353   // with a <VF x Ty> vector.
354   if (VF.isVector()) {
355     auto *VectorTy = VectorType::get(Ty, VF);
356     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
357   }
358 
359   // If the vectorization factor is one, we just check if an array of type Ty
360   // requires padding between elements.
361   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
362 }
363 
364 /// A helper function that returns the reciprocal of the block probability of
365 /// predicated blocks. If we return X, we are assuming the predicated block
366 /// will execute once for every X iterations of the loop header.
367 ///
368 /// TODO: We should use actual block probability here, if available. Currently,
369 ///       we always assume predicated blocks have a 50% chance of executing.
370 static unsigned getReciprocalPredBlockProb() { return 2; }
371 
372 /// A helper function that adds a 'fast' flag to floating-point operations.
373 static Value *addFastMathFlag(Value *V) {
374   if (isa<FPMathOperator>(V))
375     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
376   return V;
377 }
378 
379 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
380   if (isa<FPMathOperator>(V))
381     cast<Instruction>(V)->setFastMathFlags(FMF);
382   return V;
383 }
384 
385 /// A helper function that returns an integer or floating-point constant with
386 /// value C.
387 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
388   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
389                            : ConstantFP::get(Ty, C);
390 }
391 
392 /// Returns "best known" trip count for the specified loop \p L as defined by
393 /// the following procedure:
394 ///   1) Returns exact trip count if it is known.
395 ///   2) Returns expected trip count according to profile data if any.
396 ///   3) Returns upper bound estimate if it is known.
397 ///   4) Returns None if all of the above failed.
398 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
399   // Check if exact trip count is known.
400   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
401     return ExpectedTC;
402 
403   // Check if there is an expected trip count available from profile data.
404   if (LoopVectorizeWithBlockFrequency)
405     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
406       return EstimatedTC;
407 
408   // Check if upper bound estimate is known.
409   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
410     return ExpectedTC;
411 
412   return None;
413 }
414 
415 namespace llvm {
416 
417 /// InnerLoopVectorizer vectorizes loops which contain only one basic
418 /// block to a specified vectorization factor (VF).
419 /// This class performs the widening of scalars into vectors, or multiple
420 /// scalars. This class also implements the following features:
421 /// * It inserts an epilogue loop for handling loops that don't have iteration
422 ///   counts that are known to be a multiple of the vectorization factor.
423 /// * It handles the code generation for reduction variables.
424 /// * Scalarization (implementation using scalars) of un-vectorizable
425 ///   instructions.
426 /// InnerLoopVectorizer does not perform any vectorization-legality
427 /// checks, and relies on the caller to check for the different legality
428 /// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
431 class InnerLoopVectorizer {
432 public:
433   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
434                       LoopInfo *LI, DominatorTree *DT,
435                       const TargetLibraryInfo *TLI,
436                       const TargetTransformInfo *TTI, AssumptionCache *AC,
437                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
438                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
439                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
440                       ProfileSummaryInfo *PSI)
441       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
442         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
443         Builder(PSE.getSE()->getContext()),
444         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
445         BFI(BFI), PSI(PSI) {
446     // Query this against the original loop and save it here because the profile
447     // of the original loop header may change as the transformation happens.
448     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
449         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
450   }
451 
452   virtual ~InnerLoopVectorizer() = default;
453 
454   /// Create a new empty loop that will contain vectorized instructions later
455   /// on, while the old loop will be used as the scalar remainder. Control flow
456   /// is generated around the vectorized (and scalar epilogue) loops consisting
457   /// of various checks and bypasses. Return the pre-header block of the new
458   /// loop.
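  ///
  /// Roughly, the generated skeleton looks as follows (the runtime checks are
  /// emitted only when they are required):
  ///
  ///   iteration count check --\
  ///   SCEV + memory checks ----+--(any check fails)--> scalar preheader
  ///            |
  ///   vector preheader -> vector loop body -> middle block
  ///
  /// where the middle block branches to the scalar preheader when remainder
  /// iterations are left and to the exit block otherwise; the original loop
  /// becomes the scalar epilogue.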
459   BasicBlock *createVectorizedLoopSkeleton();
460 
461   /// Widen a single instruction within the innermost loop.
462   void widenInstruction(Instruction &I, VPUser &Operands,
463                         VPTransformState &State);
464 
465   /// Widen a single call instruction within the innermost loop.
466   void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
467                             VPTransformState &State);
468 
469   /// Widen a single select instruction within the innermost loop.
470   void widenSelectInstruction(SelectInst &I, VPUser &Operands,
471                               bool InvariantCond, VPTransformState &State);
472 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
474   void fixVectorizedLoop();
475 
476   // Return true if any runtime check is added.
477   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
478 
479   /// A type for vectorized values in the new loop. Each value from the
480   /// original loop, when vectorized, is represented by UF vector values in the
481   /// new unrolled loop, where UF is the unroll factor.
482   using VectorParts = SmallVector<Value *, 2>;
483 
484   /// Vectorize a single GetElementPtrInst based on information gathered and
485   /// decisions taken during planning.
486   void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
487                 ElementCount VF, bool IsPtrLoopInvariant,
488                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
489 
490   /// Vectorize a single PHINode in a block. This method handles the induction
491   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
492   /// arbitrary length vectors.
493   void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);
494 
495   /// A helper function to scalarize a single Instruction in the innermost loop.
496   /// Generates a sequence of scalar instances for each lane between \p MinLane
497   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
498   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
499   /// Instr's operands.
500   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
501                             const VPIteration &Instance, bool IfPredicateInstr,
502                             VPTransformState &State);
503 
504   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
505   /// is provided, the integer induction variable will first be truncated to
506   /// the corresponding type.
507   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
508 
509   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
510   /// vector or scalar value on-demand if one is not yet available. When
511   /// vectorizing a loop, we visit the definition of an instruction before its
512   /// uses. When visiting the definition, we either vectorize or scalarize the
513   /// instruction, creating an entry for it in the corresponding map. (In some
514   /// cases, such as induction variables, we will create both vector and scalar
515   /// entries.) Then, as we encounter uses of the definition, we derive values
516   /// for each scalar or vector use unless such a value is already available.
517   /// For example, if we scalarize a definition and one of its uses is vector,
518   /// we build the required vector on-demand with an insertelement sequence
519   /// when visiting the use. Otherwise, if the use is scalar, we can use the
520   /// existing scalar definition.
521   ///
522   /// Return a value in the new loop corresponding to \p V from the original
523   /// loop at unroll index \p Part. If the value has already been vectorized,
524   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
525   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
526   /// a new vector value on-demand by inserting the scalar values into a vector
527   /// with an insertelement sequence. If the value has been neither vectorized
528   /// nor scalarized, it must be loop invariant, so we simply broadcast the
529   /// value into a vector.
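  ///
  /// For example, if a definition was scalarized into lanes %s0..%s3 for
  /// VF = 4 and a vector use is encountered later, the vector operand is
  /// built on demand with an insertelement chain, conceptually:
  ///   %v0 = insertelement <4 x i32> undef, i32 %s0, i32 0
  ///   ...
  ///   %v3 = insertelement <4 x i32> %v2, i32 %s3, i32 3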
530   Value *getOrCreateVectorValue(Value *V, unsigned Part);
531 
532   /// Return a value in the new loop corresponding to \p V from the original
533   /// loop at unroll and vector indices \p Instance. If the value has been
534   /// vectorized but not scalarized, the necessary extractelement instruction
535   /// will be generated.
536   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
537 
538   /// Construct the vector value of a scalarized value \p V one lane at a time.
539   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
540 
541   /// Try to vectorize interleaved access group \p Group with the base address
542   /// given in \p Addr, optionally masking the vector operations if \p
543   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
544   /// values in the vectorized loop.
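  ///
  /// For example, an interleave group of factor 2 (accesses to A[i] and
  /// A[i+1]) can be vectorized with a single wide load plus shuffles,
  /// conceptually:
  ///   %wide.vec = load <8 x i32>, <8 x i32>* %ptr
  ///   %v0 = shufflevector %wide.vec, undef, <0, 2, 4, 6> ; elements of A[i]
  ///   %v1 = shufflevector %wide.vec, undef, <1, 3, 5, 7> ; elements of A[i+1]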
545   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
546                                 VPTransformState &State, VPValue *Addr,
547                                 VPValue *BlockInMask = nullptr);
548 
549   /// Vectorize Load and Store instructions with the base address given in \p
550   /// Addr, optionally masking the vector operations if \p BlockInMask is
551   /// non-null. Use \p State to translate given VPValues to IR values in the
552   /// vectorized loop.
553   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
554                                   VPValue *Addr, VPValue *StoredValue,
555                                   VPValue *BlockInMask);
556 
557   /// Set the debug location in the builder using the debug location in
558   /// the instruction.
559   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
560 
561   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
562   void fixNonInductionPHIs(void);
563 
564 protected:
565   friend class LoopVectorizationPlanner;
566 
567   /// A small list of PHINodes.
568   using PhiVector = SmallVector<PHINode *, 4>;
569 
570   /// A type for scalarized values in the new loop. Each value from the
571   /// original loop, when scalarized, is represented by UF x VF scalar values
572   /// in the new unrolled loop, where UF is the unroll factor and VF is the
573   /// vectorization factor.
574   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
575 
576   /// Set up the values of the IVs correctly when exiting the vector loop.
577   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
578                     Value *CountRoundDown, Value *EndValue,
579                     BasicBlock *MiddleBlock);
580 
581   /// Create a new induction variable inside L.
582   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
583                                    Value *Step, Instruction *DL);
584 
585   /// Handle all cross-iteration phis in the header.
586   void fixCrossIterationPHIs();
587 
588   /// Fix a first-order recurrence. This is the second phase of vectorizing
589   /// this phi node.
590   void fixFirstOrderRecurrence(PHINode *Phi);
591 
592   /// Fix a reduction cross-iteration phi. This is the second phase of
593   /// vectorizing this phi node.
594   void fixReduction(PHINode *Phi);
595 
596   /// Clear NSW/NUW flags from reduction instructions if necessary.
597   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
598 
  /// The loop exit block may have single-value PHI nodes with some incoming
  /// value. While vectorizing we only handled real values that were defined
  /// inside the loop, and we should have one incoming value for each
  /// predecessor of its parent basic block. See PR14725.
603   void fixLCSSAPHIs();
604 
605   /// Iteratively sink the scalarized operands of a predicated instruction into
606   /// the block that was created for it.
607   void sinkScalarOperands(Instruction *PredInst);
608 
609   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
610   /// represented as.
611   void truncateToMinimalBitwidths();
612 
613   /// Create a broadcast instruction. This method generates a broadcast
614   /// instruction (shuffle) for loop invariant values and for the induction
615   /// value. If this is the induction variable then we extend it to N, N+1, ...
616   /// this is needed because each iteration in the loop corresponds to a SIMD
617   /// element.
618   virtual Value *getBroadcastInstrs(Value *V);
619 
  /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
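  ///
  /// For example, with StartIdx = 0, Step = %s and a 4-element Val, the result
  /// is conceptually Val + <0, %s, 2 * %s, 3 * %s>.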
623   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
624                                Instruction::BinaryOps Opcode =
625                                Instruction::BinaryOpsEnd);
626 
627   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
628   /// variable on which to base the steps, \p Step is the size of the step, and
629   /// \p EntryVal is the value from the original loop that maps to the steps.
630   /// Note that \p EntryVal doesn't have to be an induction variable - it
631   /// can also be a truncate instruction.
632   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
633                         const InductionDescriptor &ID);
634 
635   /// Create a vector induction phi node based on an existing scalar one. \p
636   /// EntryVal is the value from the original loop that maps to the vector phi
637   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
638   /// truncate instruction, instead of widening the original IV, we widen a
639   /// version of the IV truncated to \p EntryVal's type.
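  ///
  /// For example, for an integer IV with start %start, step %step and VF = 4,
  /// the vector phi conceptually starts at
  ///   <%start, %start + %step, %start + 2 * %step, %start + 3 * %step>
  /// and is advanced by a splat of 4 * %step on every vector iteration.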
640   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
641                                        Value *Step, Instruction *EntryVal);
642 
643   /// Returns true if an instruction \p I should be scalarized instead of
644   /// vectorized for the chosen vectorization factor.
645   bool shouldScalarizeInstruction(Instruction *I) const;
646 
647   /// Returns true if we should generate a scalar version of \p IV.
648   bool needsScalarInduction(Instruction *IV) const;
649 
650   /// If there is a cast involved in the induction variable \p ID, which should
651   /// be ignored in the vectorized loop body, this function records the
652   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
653   /// cast. We had already proved that the casted Phi is equal to the uncasted
654   /// Phi in the vectorized loop (under a runtime guard), and therefore
655   /// there is no need to vectorize the cast - the same value can be used in the
656   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
659   ///
660   /// \p EntryVal is the value from the original loop that maps to the vector
661   /// phi node and is used to distinguish what is the IV currently being
662   /// processed - original one (if \p EntryVal is a phi corresponding to the
663   /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
665   /// latter case \p EntryVal is a TruncInst and we must not record anything for
666   /// that IV, but it's error-prone to expect callers of this routine to care
667   /// about that, hence this explicit parameter.
668   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
669                                              const Instruction *EntryVal,
670                                              Value *VectorLoopValue,
671                                              unsigned Part,
672                                              unsigned Lane = UINT_MAX);
673 
674   /// Generate a shuffle sequence that will reverse the vector Vec.
675   virtual Value *reverseVector(Value *Vec);
676 
677   /// Returns (and creates if needed) the original loop trip count.
678   Value *getOrCreateTripCount(Loop *NewLoop);
679 
680   /// Returns (and creates if needed) the trip count of the widened loop.
681   Value *getOrCreateVectorTripCount(Loop *NewLoop);
682 
683   /// Returns a bitcasted value to the requested vector type.
684   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
685   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
686                                 const DataLayout &DL);
687 
688   /// Emit a bypass check to see if the vector trip count is zero, including if
689   /// it overflows.
690   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
691 
692   /// Emit a bypass check to see if all of the SCEV assumptions we've
693   /// had to make are correct.
694   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
695 
696   /// Emit bypass checks to check any memory assumptions we may have made.
697   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
698 
699   /// Compute the transformed value of Index at offset StartValue using step
700   /// StepValue.
701   /// For integer induction, returns StartValue + Index * StepValue.
702   /// For pointer induction, returns StartValue[Index * StepValue].
703   /// FIXME: The newly created binary instructions should contain nsw/nuw
704   /// flags, which can be found from the original scalar operations.
705   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
706                               const DataLayout &DL,
707                               const InductionDescriptor &ID) const;
708 
709   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
710   /// vector loop preheader, middle block and scalar preheader. Also
711   /// allocate a loop object for the new vector loop and return it.
712   Loop *createVectorLoopSkeleton(StringRef Prefix);
713 
714   /// Create new phi nodes for the induction variables to resume iteration count
715   /// in the scalar epilogue, from where the vectorized loop left off (given by
716   /// \p VectorTripCount).
717   void createInductionResumeValues(Loop *L, Value *VectorTripCount);
718 
719   /// Complete the loop skeleton by adding debug MDs, creating appropriate
720   /// conditional branches in the middle block, preparing the builder and
721   /// running the verifier. Take in the vector loop \p L as argument, and return
722   /// the preheader of the completed vector loop.
723   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
724 
725   /// Add additional metadata to \p To that was not present on \p Orig.
726   ///
727   /// Currently this is used to add the noalias annotations based on the
728   /// inserted memchecks.  Use this for instructions that are *cloned* into the
729   /// vector loop.
730   void addNewMetadata(Instruction *To, const Instruction *Orig);
731 
732   /// Add metadata from one instruction to another.
733   ///
734   /// This includes both the original MDs from \p From and additional ones (\see
735   /// addNewMetadata).  Use this for *newly created* instructions in the vector
736   /// loop.
737   void addMetadata(Instruction *To, Instruction *From);
738 
739   /// Similar to the previous function but it adds the metadata to a
740   /// vector of instructions.
741   void addMetadata(ArrayRef<Value *> To, Instruction *From);
742 
743   /// The original loop.
744   Loop *OrigLoop;
745 
746   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
747   /// dynamic knowledge to simplify SCEV expressions and converts them to a
748   /// more usable form.
749   PredicatedScalarEvolution &PSE;
750 
751   /// Loop Info.
752   LoopInfo *LI;
753 
754   /// Dominator Tree.
755   DominatorTree *DT;
756 
757   /// Alias Analysis.
758   AAResults *AA;
759 
760   /// Target Library Info.
761   const TargetLibraryInfo *TLI;
762 
763   /// Target Transform Info.
764   const TargetTransformInfo *TTI;
765 
766   /// Assumption Cache.
767   AssumptionCache *AC;
768 
769   /// Interface to emit optimization remarks.
770   OptimizationRemarkEmitter *ORE;
771 
772   /// LoopVersioning.  It's only set up (non-null) if memchecks were
773   /// used.
774   ///
775   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
777   std::unique_ptr<LoopVersioning> LVer;
778 
779   /// The vectorization SIMD factor to use. Each vector will have this many
780   /// vector elements.
781   ElementCount VF;
782 
783   /// The vectorization unroll factor to use. Each scalar is vectorized to this
784   /// many different vector instructions.
785   unsigned UF;
786 
787   /// The builder that we use
788   IRBuilder<> Builder;
789 
790   // --- Vectorization state ---
791 
792   /// The vector-loop preheader.
793   BasicBlock *LoopVectorPreHeader;
794 
795   /// The scalar-loop preheader.
796   BasicBlock *LoopScalarPreHeader;
797 
798   /// Middle Block between the vector and the scalar.
799   BasicBlock *LoopMiddleBlock;
800 
801   /// The ExitBlock of the scalar loop.
802   BasicBlock *LoopExitBlock;
803 
804   /// The vector loop body.
805   BasicBlock *LoopVectorBody;
806 
807   /// The scalar loop body.
808   BasicBlock *LoopScalarBody;
809 
810   /// A list of all bypass blocks. The first block is the entry of the loop.
811   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
812 
813   /// The new Induction variable which was added to the new block.
814   PHINode *Induction = nullptr;
815 
816   /// The induction variable of the old basic block.
817   PHINode *OldInduction = nullptr;
818 
819   /// Maps values from the original loop to their corresponding values in the
820   /// vectorized loop. A key value can map to either vector values, scalar
821   /// values or both kinds of values, depending on whether the key was
822   /// vectorized and scalarized.
823   VectorizerValueMap VectorLoopValueMap;
824 
825   /// Store instructions that were predicated.
826   SmallVector<Instruction *, 4> PredicatedInstructions;
827 
828   /// Trip count of the original loop.
829   Value *TripCount = nullptr;
830 
831   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
832   Value *VectorTripCount = nullptr;
833 
834   /// The legality analysis.
835   LoopVectorizationLegality *Legal;
836 
  /// The profitability analysis.
838   LoopVectorizationCostModel *Cost;
839 
840   // Record whether runtime checks are added.
841   bool AddedSafetyChecks = false;
842 
843   // Holds the end values for each induction variable. We save the end values
844   // so we can later fix-up the external users of the induction variables.
845   DenseMap<PHINode *, Value *> IVEndValues;
846 
847   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
848   // fixed up at the end of vector code generation.
849   SmallVector<PHINode *, 8> OrigPHIsToFix;
850 
851   /// BFI and PSI are used to check for profile guided size optimizations.
852   BlockFrequencyInfo *BFI;
853   ProfileSummaryInfo *PSI;
854 
  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
857   bool OptForSizeBasedOnProfile;
858 };
859 
860 class InnerLoopUnroller : public InnerLoopVectorizer {
861 public:
862   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
863                     LoopInfo *LI, DominatorTree *DT,
864                     const TargetLibraryInfo *TLI,
865                     const TargetTransformInfo *TTI, AssumptionCache *AC,
866                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
867                     LoopVectorizationLegality *LVL,
868                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
869                     ProfileSummaryInfo *PSI)
870       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
871                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
872                             BFI, PSI) {}
873 
874 private:
875   Value *getBroadcastInstrs(Value *V) override;
876   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
877                        Instruction::BinaryOps Opcode =
878                        Instruction::BinaryOpsEnd) override;
879   Value *reverseVector(Value *Vec) override;
880 };
881 
882 } // end namespace llvm
883 
/// Look for a meaningful debug location on the instruction or its
/// operands.
886 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
887   if (!I)
888     return I;
889 
890   DebugLoc Empty;
891   if (I->getDebugLoc() != Empty)
892     return I;
893 
  for (Value *Op : I->operands())
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
899 
900   return I;
901 }
902 
903 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
904   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
905     const DILocation *DIL = Inst->getDebugLoc();
906     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
907         !isa<DbgInfoIntrinsic>(Inst)) {
908       assert(!VF.isScalable() && "scalable vectors not yet supported.");
909       auto NewDIL =
910           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
911       if (NewDIL)
912         B.SetCurrentDebugLocation(NewDIL.getValue());
913       else
914         LLVM_DEBUG(dbgs()
915                    << "Failed to create new discriminator: "
916                    << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B.SetCurrentDebugLocation(DIL);
920   } else
921     B.SetCurrentDebugLocation(DebugLoc());
922 }
923 
924 /// Write a record \p DebugMsg about vectorization failure to the debug
925 /// output stream. If \p I is passed, it is an instruction that prevents
926 /// vectorization.
927 #ifndef NDEBUG
928 static void debugVectorizationFailure(const StringRef DebugMsg,
929     Instruction *I) {
930   dbgs() << "LV: Not vectorizing: " << DebugMsg;
931   if (I != nullptr)
932     dbgs() << " " << *I;
933   else
934     dbgs() << '.';
935   dbgs() << '\n';
936 }
937 #endif
938 
939 /// Create an analysis remark that explains why vectorization failed
940 ///
941 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
942 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
943 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
944 /// the location of the remark.  \return the remark object that can be
945 /// streamed to.
946 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
947     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
948   Value *CodeRegion = TheLoop->getHeader();
949   DebugLoc DL = TheLoop->getStartLoc();
950 
951   if (I) {
952     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
955     if (I->getDebugLoc())
956       DL = I->getDebugLoc();
957   }
958 
959   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
960   R << "loop not vectorized: ";
961   return R;
962 }
963 
964 namespace llvm {
965 
966 void reportVectorizationFailure(const StringRef DebugMsg,
967     const StringRef OREMsg, const StringRef ORETag,
968     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
969   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
970   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
971   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
972                 ORETag, TheLoop, I) << OREMsg);
973 }
974 
975 } // end namespace llvm
976 
977 #ifndef NDEBUG
978 /// \return string containing a file name and a line # for the given loop.
979 static std::string getDebugLocString(const Loop *L) {
980   std::string Result;
981   if (L) {
982     raw_string_ostream OS(Result);
983     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
984       LoopDbgLoc.print(OS);
985     else
986       // Just print the module name.
987       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
988     OS.flush();
989   }
990   return Result;
991 }
992 #endif
993 
994 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
995                                          const Instruction *Orig) {
996   // If the loop was versioned with memchecks, add the corresponding no-alias
997   // metadata.
998   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
999     LVer->annotateInstWithNoAlias(To, Orig);
1000 }
1001 
1002 void InnerLoopVectorizer::addMetadata(Instruction *To,
1003                                       Instruction *From) {
1004   propagateMetadata(To, From);
1005   addNewMetadata(To, From);
1006 }
1007 
1008 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1009                                       Instruction *From) {
1010   for (Value *V : To) {
1011     if (Instruction *I = dyn_cast<Instruction>(V))
1012       addMetadata(I, From);
1013   }
1014 }
1015 
1016 namespace llvm {
1017 
// Loop vectorization cost-model hints for how the scalar epilogue loop should
// be lowered.
1020 enum ScalarEpilogueLowering {
1021 
1022   // The default: allowing scalar epilogues.
1023   CM_ScalarEpilogueAllowed,
1024 
1025   // Vectorization with OptForSize: don't allow epilogues.
1026   CM_ScalarEpilogueNotAllowedOptSize,
1027 
  // A special case of vectorization with OptForSize: loops with a very small
1029   // trip count are considered for vectorization under OptForSize, thereby
1030   // making sure the cost of their loop body is dominant, free of runtime
1031   // guards and scalar iteration overheads.
1032   CM_ScalarEpilogueNotAllowedLowTripLoop,
1033 
1034   // Loop hint predicate indicating an epilogue is undesired.
1035   CM_ScalarEpilogueNotNeededUsePredicate
1036 };
1037 
1038 /// LoopVectorizationCostModel - estimates the expected speedups due to
1039 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
1042 /// expected speedup/slowdowns due to the supported instruction set. We use the
1043 /// TargetTransformInfo to query the different backends for the cost of
1044 /// different operations.
1045 class LoopVectorizationCostModel {
1046 public:
1047   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1048                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1049                              LoopVectorizationLegality *Legal,
1050                              const TargetTransformInfo &TTI,
1051                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1052                              AssumptionCache *AC,
1053                              OptimizationRemarkEmitter *ORE, const Function *F,
1054                              const LoopVectorizeHints *Hints,
1055                              InterleavedAccessInfo &IAI)
1056       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1057         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1058         Hints(Hints), InterleaveInfo(IAI) {}
1059 
1060   /// \return An upper bound for the vectorization factor, or None if
1061   /// vectorization and interleaving should be avoided up front.
1062   Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);
1063 
1064   /// \return True if runtime checks are required for vectorization, and false
1065   /// otherwise.
1066   bool runtimeChecksRequired();
1067 
1068   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is nonzero,
  /// that vectorization factor is selected if vectorization is possible.
1072   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
1073 
1074   /// Setup cost-based decisions for user vectorization factor.
1075   void selectUserVectorizationFactor(ElementCount UserVF) {
1076     collectUniformsAndScalars(UserVF);
1077     collectInstsToScalarize(UserVF);
1078   }
1079 
1080   /// \return The size (in bits) of the smallest and widest types in the code
1081   /// that needs to be vectorized. We ignore values that remain scalar such as
1082   /// 64 bit loop indices.
1083   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1084 
1085   /// \return The desired interleave count.
1086   /// If interleave count has been specified by metadata it will be returned.
1087   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1088   /// are the selected vectorization factor and the cost of the selected VF.
1089   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1090 
  /// A memory access instruction may be vectorized in more than one way; the
  /// form it takes after vectorization depends on its cost. This function
  /// makes cost-based decisions for Load/Store instructions and collects them
  /// in a map. This decision map is used for building the lists of
  /// loop-uniform and loop-scalar instructions. The calculated cost is saved
  /// with the widening decision in order to avoid redundant calculations.
1098   void setCostBasedWideningDecision(ElementCount VF);
1099 
1100   /// A struct that represents some properties of the register usage
1101   /// of a loop.
1102   struct RegisterUsage {
1103     /// Holds the number of loop invariant values that are used in the loop.
1104     /// The key is ClassID of target-provided register class.
1105     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1106     /// Holds the maximum number of concurrent live intervals in the loop.
1107     /// The key is ClassID of target-provided register class.
1108     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1109   };
1110 
1111   /// \return Returns information about the register usages of the loop for the
1112   /// given vectorization factors.
1113   SmallVector<RegisterUsage, 8>
1114   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1115 
1116   /// Collect values we want to ignore in the cost model.
1117   void collectValuesToIgnore();
1118 
  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
1121   void collectInLoopReductions();
1122 
1123   /// \returns The smallest bitwidth each instruction can be represented with.
1124   /// The vector equivalents of these instructions should be truncated to this
1125   /// type.
1126   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1127     return MinBWs;
1128   }
1129 
1130   /// \returns True if it is more profitable to scalarize instruction \p I for
1131   /// vectorization factor \p VF.
1132   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1133     assert(VF.isVector() &&
1134            "Profitable to scalarize relevant only for VF > 1.");
1135 
1136     // Cost model is not run in the VPlan-native path - return conservative
1137     // result until this changes.
1138     if (EnableVPlanNativePath)
1139       return false;
1140 
1141     auto Scalars = InstsToScalarize.find(VF);
1142     assert(Scalars != InstsToScalarize.end() &&
1143            "VF not yet analyzed for scalarization profitability");
1144     return Scalars->second.find(I) != Scalars->second.end();
1145   }
1146 
1147   /// Returns true if \p I is known to be uniform after vectorization.
1148   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1149     if (VF.isScalar())
1150       return true;
1151 
1152     // Cost model is not run in the VPlan-native path - return conservative
1153     // result until this changes.
1154     if (EnableVPlanNativePath)
1155       return false;
1156 
1157     auto UniformsPerVF = Uniforms.find(VF);
1158     assert(UniformsPerVF != Uniforms.end() &&
1159            "VF not yet analyzed for uniformity");
1160     return UniformsPerVF->second.count(I);
1161   }
1162 
1163   /// Returns true if \p I is known to be scalar after vectorization.
1164   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1165     if (VF.isScalar())
1166       return true;
1167 
1168     // Cost model is not run in the VPlan-native path - return conservative
1169     // result until this changes.
1170     if (EnableVPlanNativePath)
1171       return false;
1172 
1173     auto ScalarsPerVF = Scalars.find(VF);
1174     assert(ScalarsPerVF != Scalars.end() &&
1175            "Scalar values are not calculated for VF");
1176     return ScalarsPerVF->second.count(I);
1177   }
1178 
1179   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1180   /// for vectorization factor \p VF.
1181   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1182     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1183            !isProfitableToScalarize(I, VF) &&
1184            !isScalarAfterVectorization(I, VF);
1185   }
1186 
1187   /// Decision that was taken during cost calculation for memory instruction.
1188   enum InstWidening {
1189     CM_Unknown,
1190     CM_Widen,         // For consecutive accesses with stride +1.
1191     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1192     CM_Interleave,
1193     CM_GatherScatter,
1194     CM_Scalarize
1195   };
1196 
1197   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1198   /// instruction \p I and vector width \p VF.
1199   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1200                            unsigned Cost) {
1201     assert(VF.isVector() && "Expected VF >=2");
1202     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1203   }
1204 
1205   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1206   /// interleaving group \p Grp and vector width \p VF.
1207   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1208                            ElementCount VF, InstWidening W, unsigned Cost) {
1209     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group, but
    // assign the cost to the insert position only.
1212     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1213       if (auto *I = Grp->getMember(i)) {
1214         if (Grp->getInsertPos() == I)
1215           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1216         else
1217           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1218       }
1219     }
1220   }
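
  // For example (illustrative values): for a factor-3 interleave group
  // {A, B, C} whose insert position is A, setWideningDecision(Grp, VF,
  // CM_Interleave, 12) records (CM_Interleave, 12) for A and (CM_Interleave,
  // 0) for B and C, so the group cost is counted exactly once.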
1221 
1222   /// Return the cost model decision for the given instruction \p I and vector
1223   /// width \p VF. Return CM_Unknown if this instruction did not pass
1224   /// through the cost modeling.
1225   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1226     assert(!VF.isScalable() && "scalable vectors not yet supported.");
1227     assert(VF.isVector() && "Expected VF >=2");
1228 
1229     // Cost model is not run in the VPlan-native path - return conservative
1230     // result until this changes.
1231     if (EnableVPlanNativePath)
1232       return CM_GatherScatter;
1233 
1234     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1235     auto Itr = WideningDecisions.find(InstOnVF);
1236     if (Itr == WideningDecisions.end())
1237       return CM_Unknown;
1238     return Itr->second.first;
1239   }
1240 
1241   /// Return the vectorization cost for the given instruction \p I and vector
1242   /// width \p VF.
1243   unsigned getWideningCost(Instruction *I, ElementCount VF) {
1244     assert(VF.isVector() && "Expected VF >=2");
1245     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1246     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1247            "The cost is not calculated");
1248     return WideningDecisions[InstOnVF].second;
1249   }
1250 
1251   /// Return True if instruction \p I is an optimizable truncate whose operand
1252   /// is an induction variable. Such a truncate will be removed by adding a new
1253   /// induction variable with the destination type.
1254   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1255     // If the instruction is not a truncate, return false.
1256     auto *Trunc = dyn_cast<TruncInst>(I);
1257     if (!Trunc)
1258       return false;
1259 
1260     // Get the source and destination types of the truncate.
1261     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1262     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1263 
1264     // If the truncate is free for the given types, return false. Replacing a
1265     // free truncate with an induction variable would add an induction variable
1266     // update instruction to each iteration of the loop. We exclude from this
1267     // check the primary induction variable since it will need an update
1268     // instruction regardless.
1269     Value *Op = Trunc->getOperand(0);
1270     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1271       return false;
1272 
1273     // If the truncated value is not an induction variable, return false.
1274     return Legal->isInductionPhi(Op);
1275   }
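
  // Illustrative example (not from the original source): given
  //   %iv = phi i64 [ 0, %preheader ], [ %iv.next, %loop ]
  //   %t  = trunc i64 %iv to i32
  // the truncate is optimizable when %iv is an induction phi and is either
  // the primary induction or the truncate is not free for the widened types;
  // vectorization then introduces a new i32 induction and the truncate goes
  // away.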
1276 
1277   /// Collects the instructions to scalarize for each predicated instruction in
1278   /// the loop.
1279   void collectInstsToScalarize(ElementCount VF);
1280 
1281   /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decisions for Load/Store instructions that may
  /// be vectorized as interleaved, gather/scatter or scalarized accesses.
1284   void collectUniformsAndScalars(ElementCount VF) {
1285     // Do the analysis once.
1286     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1287       return;
1288     setCostBasedWideningDecision(VF);
1289     collectLoopUniforms(VF);
1290     collectLoopScalars(VF);
1291   }
1292 
1293   /// Returns true if the target machine supports masked store operation
1294   /// for the given \p DataType and kind of access to \p Ptr.
1295   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1296     return Legal->isConsecutivePtr(Ptr) &&
1297            TTI.isLegalMaskedStore(DataType, Alignment);
1298   }
1299 
1300   /// Returns true if the target machine supports masked load operation
1301   /// for the given \p DataType and kind of access to \p Ptr.
1302   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1303     return Legal->isConsecutivePtr(Ptr) &&
1304            TTI.isLegalMaskedLoad(DataType, Alignment);
1305   }
1306 
1307   /// Returns true if the target machine supports masked scatter operation
1308   /// for the given \p DataType.
1309   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1310     return TTI.isLegalMaskedScatter(DataType, Alignment);
1311   }
1312 
1313   /// Returns true if the target machine supports masked gather operation
1314   /// for the given \p DataType.
1315   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1316     return TTI.isLegalMaskedGather(DataType, Alignment);
1317   }
1318 
1319   /// Returns true if the target machine can represent \p V as a masked gather
1320   /// or scatter operation.
1321   bool isLegalGatherOrScatter(Value *V) {
1322     bool LI = isa<LoadInst>(V);
1323     bool SI = isa<StoreInst>(V);
1324     if (!LI && !SI)
1325       return false;
1326     auto *Ty = getMemInstValueType(V);
1327     Align Align = getLoadStoreAlignment(V);
1328     return (LI && isLegalMaskedGather(Ty, Align)) ||
1329            (SI && isLegalMaskedScatter(Ty, Align));
1330   }
1331 
1332   /// Returns true if \p I is an instruction that will be scalarized with
1333   /// predication. Such instructions include conditional stores and
1334   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1337   bool isScalarWithPredication(Instruction *I,
1338                                ElementCount VF = ElementCount::getFixed(1));
1339 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked
  /// gather/scatter. Superset of instructions that return true for
  /// isScalarWithPredication.
1343   bool isPredicatedInst(Instruction *I) {
1344     if (!blockNeedsPredication(I->getParent()))
1345       return false;
1346     // Loads and stores that need some form of masked operation are predicated
1347     // instructions.
1348     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1349       return Legal->isMaskRequired(I);
1350     return isScalarWithPredication(I);
1351   }
1352 
1353   /// Returns true if \p I is a memory instruction with consecutive memory
1354   /// access that can be widened.
1355   bool
1356   memoryInstructionCanBeWidened(Instruction *I,
1357                                 ElementCount VF = ElementCount::getFixed(1));
1358 
1359   /// Returns true if \p I is a memory instruction in an interleaved-group
1360   /// of memory accesses that can be vectorized with wide vector loads/stores
1361   /// and shuffles.
1362   bool
1363   interleavedAccessCanBeWidened(Instruction *I,
1364                                 ElementCount VF = ElementCount::getFixed(1));
1365 
1366   /// Check if \p Instr belongs to any interleaved access group.
1367   bool isAccessInterleaved(Instruction *Instr) {
1368     return InterleaveInfo.isInterleaved(Instr);
1369   }
1370 
1371   /// Get the interleaved access group that \p Instr belongs to.
1372   const InterleaveGroup<Instruction> *
1373   getInterleavedAccessGroup(Instruction *Instr) {
1374     return InterleaveInfo.getInterleaveGroup(Instr);
1375   }
1376 
1377   /// Returns true if an interleaved group requires a scalar iteration
1378   /// to handle accesses with gaps, and there is nothing preventing us from
1379   /// creating a scalar epilogue.
1380   bool requiresScalarEpilogue() const {
1381     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1382   }
1383 
  /// Returns true if a scalar epilogue is allowed, i.e. not prohibited by
  /// optsize or a loop hint annotation.
1386   bool isScalarEpilogueAllowed() const {
1387     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1388   }
1389 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1391   bool foldTailByMasking() const { return FoldTailByMasking; }
1392 
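  /// Returns true if the instructions in \p BB require predication, either
  /// because all blocks are masked when folding the tail or because \p BB
  /// needs predication in the original loop.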
1393   bool blockNeedsPredication(BasicBlock *BB) {
1394     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1395   }
1396 
1397   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1398   /// nodes to the chain of instructions representing the reductions. Uses a
1399   /// MapVector to ensure deterministic iteration order.
1400   using ReductionChainMap =
1401       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1402 
1403   /// Return the chain of instructions representing an inloop reduction.
1404   const ReductionChainMap &getInLoopReductionChains() const {
1405     return InLoopReductionChains;
1406   }
1407 
1408   /// Returns true if the Phi is part of an inloop reduction.
1409   bool isInLoopReduction(PHINode *Phi) const {
1410     return InLoopReductionChains.count(Phi);
1411   }
1412 
1413   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1414   /// with factor VF.  Return the cost of the instruction, including
1415   /// scalarization overhead if it's needed.
1416   unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1417 
1418   /// Estimate cost of a call instruction CI if it were vectorized with factor
1419   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either no vector version is available or the vector
  /// version is too expensive.
1423   unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1424                              bool &NeedToScalarize);
1425 
1426   /// Invalidates decisions already taken by the cost model.
1427   void invalidateCostModelingDecisions() {
1428     WideningDecisions.clear();
1429     Uniforms.clear();
1430     Scalars.clear();
1431   }
1432 
1433 private:
1434   unsigned NumPredStores = 0;
1435 
1436   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1437   /// than zero. One is returned if vectorization should best be avoided due
1438   /// to cost.
1439   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1440 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1448   using VectorizationCostTy = std::pair<unsigned, bool>;
1449 
1450   /// Returns the expected execution cost. The unit of the cost does
1451   /// not matter because we use the 'cost' units to compare different
1452   /// vector widths. The cost that is returned is *not* normalized by
1453   /// the factor width.
1454   VectorizationCostTy expectedCost(ElementCount VF);
1455 
1456   /// Returns the execution time cost of an instruction for a given vector
1457   /// width. Vector width of one means scalar.
1458   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1459 
1460   /// The cost-computation logic from getInstructionCost which provides
1461   /// the vector type as an output parameter.
1462   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1463 
1464   /// Calculate vectorization cost of memory instruction \p I.
1465   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1466 
1467   /// The cost computation for scalarized memory instruction.
1468   unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1469 
1470   /// The cost computation for interleaving group of memory instructions.
1471   unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1472 
1473   /// The cost computation for Gather/Scatter instruction.
1474   unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1475 
1476   /// The cost computation for widening instruction \p I with consecutive
1477   /// memory access.
1478   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1479 
  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored ? 0 : extract of last
  /// element).
1484   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1485 
1486   /// Estimate the overhead of scalarizing an instruction. This is a
1487   /// convenience wrapper for the type-based getScalarizationOverhead API.
1488   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1489 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1492   bool isConsecutiveLoadOrStore(Instruction *I);
1493 
1494   /// Returns true if an artificially high cost for emulated masked memrefs
1495   /// should be used.
1496   bool useEmulatedMaskMemRefHack(Instruction *I);
1497 
1498   /// Map of scalar integer values to the smallest bitwidth they can be legally
1499   /// represented as. The vector equivalents of these values should be truncated
1500   /// to this type.
1501   MapVector<Instruction *, uint64_t> MinBWs;
1502 
1503   /// A type representing the costs for instructions if they were to be
1504   /// scalarized rather than vectorized. The entries are Instruction-Cost
1505   /// pairs.
1506   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1507 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1510   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1511 
1512   /// Records whether it is allowed to have the original scalar loop execute at
1513   /// least once. This may be needed as a fallback loop in case runtime
1514   /// aliasing/dependence checks fail, or to handle the tail/remainder
1515   /// iterations when the trip count is unknown or doesn't divide by the VF,
1516   /// or as a peel-loop to handle gaps in interleave-groups.
1517   /// Under optsize and when the trip count is very small we don't allow any
1518   /// iterations to execute in the scalar loop.
1519   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1520 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1522   bool FoldTailByMasking = false;
1523 
1524   /// A map holding scalar costs for different vectorization factors. The
1525   /// presence of a cost for an instruction in the mapping indicates that the
1526   /// instruction will be scalarized when vectorizing with the associated
1527   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1528   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1529 
1530   /// Holds the instructions known to be uniform after vectorization.
1531   /// The data is collected per VF.
1532   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1533 
1534   /// Holds the instructions known to be scalar after vectorization.
1535   /// The data is collected per VF.
1536   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1537 
1538   /// Holds the instructions (address computations) that are forced to be
1539   /// scalarized.
1540   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1541 
  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
1545   ReductionChainMap InLoopReductionChains;
1546 
1547   /// Returns the expected difference in cost from scalarizing the expression
1548   /// feeding a predicated instruction \p PredInst. The instructions to
1549   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1550   /// non-negative return value implies the expression will be scalarized.
1551   /// Currently, only single-use chains are considered for scalarization.
1552   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1553                               ElementCount VF);
1554 
1555   /// Collect the instructions that are uniform after vectorization. An
1556   /// instruction is uniform if we represent it with a single scalar value in
1557   /// the vectorized loop corresponding to each vector iteration. Examples of
1558   /// uniform instructions include pointer operands of consecutive or
1559   /// interleaved memory accesses. Note that although uniformity implies an
1560   /// instruction will be scalar, the reverse is not true. In general, a
1561   /// scalarized instruction will be represented by VF scalar values in the
1562   /// vectorized loop, each corresponding to an iteration of the original
1563   /// scalar loop.
1564   void collectLoopUniforms(ElementCount VF);
1565 
1566   /// Collect the instructions that are scalar after vectorization. An
1567   /// instruction is scalar if it is known to be uniform or will be scalarized
1568   /// during vectorization. Non-uniform scalarized instructions will be
1569   /// represented by VF values in the vectorized loop, each corresponding to an
1570   /// iteration of the original scalar loop.
1571   void collectLoopScalars(ElementCount VF);
1572 
1573   /// Keeps cost model vectorization decision and cost for instructions.
1574   /// Right now it is used for memory instructions only.
1575   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1576                                 std::pair<InstWidening, unsigned>>;
1577 
1578   DecisionList WideningDecisions;
1579 
1580   /// Returns true if \p V is expected to be vectorized and it needs to be
1581   /// extracted.
1582   bool needsExtract(Value *V, ElementCount VF) const {
1583     Instruction *I = dyn_cast<Instruction>(V);
1584     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1585         TheLoop->isLoopInvariant(I))
1586       return false;
1587 
1588     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1590     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1591     // the scalars are collected. That should be a safe assumption in most
1592     // cases, because we check if the operands have vectorizable types
1593     // beforehand in LoopVectorizationLegality.
1594     return Scalars.find(VF) == Scalars.end() ||
1595            !isScalarAfterVectorization(I, VF);
1596   };
1597 
1598   /// Returns a range containing only operands needing to be extracted.
1599   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1600                                                    ElementCount VF) {
1601     return SmallVector<Value *, 4>(make_filter_range(
1602         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1603   }
1604 
1605 public:
1606   /// The loop that we evaluate.
1607   Loop *TheLoop;
1608 
1609   /// Predicated scalar evolution analysis.
1610   PredicatedScalarEvolution &PSE;
1611 
1612   /// Loop Info analysis.
1613   LoopInfo *LI;
1614 
1615   /// Vectorization legality.
1616   LoopVectorizationLegality *Legal;
1617 
1618   /// Vector target information.
1619   const TargetTransformInfo &TTI;
1620 
1621   /// Target Library Info.
1622   const TargetLibraryInfo *TLI;
1623 
1624   /// Demanded bits analysis.
1625   DemandedBits *DB;
1626 
1627   /// Assumption cache.
1628   AssumptionCache *AC;
1629 
1630   /// Interface to emit optimization remarks.
1631   OptimizationRemarkEmitter *ORE;
1632 
1633   const Function *TheFunction;
1634 
1635   /// Loop Vectorize Hint.
1636   const LoopVectorizeHints *Hints;
1637 
1638   /// The interleave access information contains groups of interleaved accesses
1639   /// with the same stride and close to each other.
1640   InterleavedAccessInfo &InterleaveInfo;
1641 
1642   /// Values to ignore in the cost model.
1643   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1644 
1645   /// Values to ignore in the cost model when VF > 1.
1646   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1647 };
1648 
1649 } // end namespace llvm
1650 
1651 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1652 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1658 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1659 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1660 // provides *explicit vectorization hints* (LV can bypass legal checks and
1661 // assume that vectorization is legal). However, both hints are implemented
1662 // using the same metadata (llvm.loop.vectorize, processed by
1663 // LoopVectorizeHints). This will be fixed in the future when the native IR
1664 // representation for pragma 'omp simd' is introduced.
1665 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1666                                    OptimizationRemarkEmitter *ORE) {
1667   assert(!OuterLp->isInnermost() && "This is not an outer loop");
1668   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1669 
1670   // Only outer loops with an explicit vectorization hint are supported.
1671   // Unannotated outer loops are ignored.
1672   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1673     return false;
1674 
1675   Function *Fn = OuterLp->getHeader()->getParent();
1676   if (!Hints.allowVectorization(Fn, OuterLp,
1677                                 true /*VectorizeOnlyWhenForced*/)) {
1678     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1679     return false;
1680   }
1681 
1682   if (Hints.getInterleave() > 1) {
1683     // TODO: Interleave support is future work.
1684     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1685                          "outer loops.\n");
1686     Hints.emitRemarkWithHints();
1687     return false;
1688   }
1689 
1690   return true;
1691 }
1692 
1693 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1694                                   OptimizationRemarkEmitter *ORE,
1695                                   SmallVectorImpl<Loop *> &V) {
1696   // Collect inner loops and outer loops without irreducible control flow. For
1697   // now, only collect outer loops that have explicit vectorization hints. If we
1698   // are stress testing the VPlan H-CFG construction, we collect the outermost
1699   // loop of every loop nest.
1700   if (L.isInnermost() || VPlanBuildStressTest ||
1701       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1702     LoopBlocksRPO RPOT(&L);
1703     RPOT.perform(LI);
1704     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1705       V.push_back(&L);
1706       // TODO: Collect inner loops inside marked outer loops in case
1707       // vectorization fails for the outer loop. Do not invoke
1708       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1709       // already known to be reducible. We can use an inherited attribute for
1710       // that.
1711       return;
1712     }
1713   }
1714   for (Loop *InnerL : L)
1715     collectSupportedLoops(*InnerL, LI, ORE, V);
1716 }
1717 
1718 namespace {
1719 
1720 /// The LoopVectorize Pass.
1721 struct LoopVectorize : public FunctionPass {
1722   /// Pass identification, replacement for typeid
1723   static char ID;
1724 
1725   LoopVectorizePass Impl;
1726 
1727   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1728                          bool VectorizeOnlyWhenForced = false)
1729       : FunctionPass(ID),
1730         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1731     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1732   }
1733 
1734   bool runOnFunction(Function &F) override {
1735     if (skipFunction(F))
1736       return false;
1737 
1738     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1739     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1740     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1741     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1742     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1743     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1744     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1745     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1746     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1747     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1748     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1749     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1750     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1751 
1752     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1753         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1754 
1755     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1756                         GetLAA, *ORE, PSI).MadeAnyChange;
1757   }
1758 
1759   void getAnalysisUsage(AnalysisUsage &AU) const override {
1760     AU.addRequired<AssumptionCacheTracker>();
1761     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1762     AU.addRequired<DominatorTreeWrapperPass>();
1763     AU.addRequired<LoopInfoWrapperPass>();
1764     AU.addRequired<ScalarEvolutionWrapperPass>();
1765     AU.addRequired<TargetTransformInfoWrapperPass>();
1766     AU.addRequired<AAResultsWrapperPass>();
1767     AU.addRequired<LoopAccessLegacyAnalysis>();
1768     AU.addRequired<DemandedBitsWrapperPass>();
1769     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1770     AU.addRequired<InjectTLIMappingsLegacy>();
1771 
    // We currently do not preserve LoopInfo/DominatorTree analyses with outer
    // loop vectorization. Until this is addressed, mark these analyses as
    // preserved only for the non-VPlan-native path.
1775     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1776     if (!EnableVPlanNativePath) {
1777       AU.addPreserved<LoopInfoWrapperPass>();
1778       AU.addPreserved<DominatorTreeWrapperPass>();
1779     }
1780 
1781     AU.addPreserved<BasicAAWrapperPass>();
1782     AU.addPreserved<GlobalsAAWrapperPass>();
1783     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1784   }
1785 };
1786 
1787 } // end anonymous namespace
1788 
1789 //===----------------------------------------------------------------------===//
1790 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1791 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1792 //===----------------------------------------------------------------------===//
1793 
1794 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
1798   Instruction *Instr = dyn_cast<Instruction>(V);
1799   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1800                      (!Instr ||
1801                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1802   // Place the code for broadcasting invariant variables in the new preheader.
1803   IRBuilder<>::InsertPointGuard Guard(Builder);
1804   if (SafeToHoist)
1805     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1806 
1807   // Broadcast the scalar into all locations in the vector.
1808   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1809 
1810   return Shuf;
1811 }
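
// Roughly, for VF = 4 and an i32 %v, the splat above expands to IR of the
// form (illustrative, names approximate):
//   %bcast.insert = insertelement <4 x i32> undef, i32 %v, i32 0
//   %bcast.splat  = shufflevector <4 x i32> %bcast.insert, <4 x i32> undef,
//                                 <4 x i32> zeroinitializer
// emitted in the vector preheader when hoisting is safe, or at the current
// insert point otherwise.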
1812 
1813 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1814     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1815   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1816          "Expected either an induction phi-node or a truncate of it!");
1817   Value *Start = II.getStartValue();
1818 
1819   // Construct the initial value of the vector IV in the vector loop preheader
1820   auto CurrIP = Builder.saveIP();
1821   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1822   if (isa<TruncInst>(EntryVal)) {
1823     assert(Start->getType()->isIntegerTy() &&
1824            "Truncation requires an integer type");
1825     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1826     Step = Builder.CreateTrunc(Step, TruncType);
1827     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1828   }
1829   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1830   Value *SteppedStart =
1831       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1832 
1833   // We create vector phi nodes for both integer and floating-point induction
1834   // variables. Here, we determine the kind of arithmetic we will perform.
1835   Instruction::BinaryOps AddOp;
1836   Instruction::BinaryOps MulOp;
1837   if (Step->getType()->isIntegerTy()) {
1838     AddOp = Instruction::Add;
1839     MulOp = Instruction::Mul;
1840   } else {
1841     AddOp = II.getInductionOpcode();
1842     MulOp = Instruction::FMul;
1843   }
1844 
1845   // Multiply the vectorization factor by the step using integer or
1846   // floating-point arithmetic as appropriate.
1847   Value *ConstVF =
1848       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
1849   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1850 
1851   // Create a vector splat to use in the induction update.
1852   //
1853   // FIXME: If the step is non-constant, we create the vector splat with
1854   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1855   //        handle a constant vector splat.
1856   assert(!VF.isScalable() && "scalable vectors not yet supported.");
1857   Value *SplatVF = isa<Constant>(Mul)
1858                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1859                        : Builder.CreateVectorSplat(VF, Mul);
1860   Builder.restoreIP(CurrIP);
1861 
1862   // We may need to add the step a number of times, depending on the unroll
1863   // factor. The last of those goes into the PHI.
1864   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1865                                     &*LoopVectorBody->getFirstInsertionPt());
1866   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1867   Instruction *LastInduction = VecInd;
1868   for (unsigned Part = 0; Part < UF; ++Part) {
1869     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1870 
1871     if (isa<TruncInst>(EntryVal))
1872       addMetadata(LastInduction, EntryVal);
1873     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1874 
1875     LastInduction = cast<Instruction>(addFastMathFlag(
1876         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1877     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1878   }
1879 
1880   // Move the last step to the end of the latch block. This ensures consistent
1881   // placement of all induction updates.
1882   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1883   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1884   auto *ICmp = cast<Instruction>(Br->getCondition());
1885   LastInduction->moveBefore(ICmp);
1886   LastInduction->setName("vec.ind.next");
1887 
1888   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1889   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1890 }
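
// Illustrative example (not from the original source): for an i32 induction
// starting at 0 with step 1, VF = 4 and UF = 2, the code above emits roughly
//   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %ph ],
//                                 [ %vec.ind.next, %latch ]
//   %step.add     = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
//   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>
// where part 0 uses %vec.ind, part 1 uses %step.add, and %vec.ind.next is
// moved to the latch and fed back into the phi.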
1891 
1892 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1893   return Cost->isScalarAfterVectorization(I, VF) ||
1894          Cost->isProfitableToScalarize(I, VF);
1895 }
1896 
1897 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1898   if (shouldScalarizeInstruction(IV))
1899     return true;
1900   auto isScalarInst = [&](User *U) -> bool {
1901     auto *I = cast<Instruction>(U);
1902     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1903   };
1904   return llvm::any_of(IV->users(), isScalarInst);
1905 }
1906 
1907 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1908     const InductionDescriptor &ID, const Instruction *EntryVal,
1909     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1910   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1911          "Expected either an induction phi-node or a truncate of it!");
1912 
  // This induction variable is not the phi from the original loop but a
  // newly-created IV, based on the proof that the cast Phi is equal to the
  // uncast Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor as the original IV, but we don't
  // have to do any recording in this case - that is done when the original IV
  // is processed.
1919   if (isa<TruncInst>(EntryVal))
1920     return;
1921 
1922   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1923   if (Casts.empty())
1924     return;
1925   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if any) have no uses outside the
1927   // induction update chain itself.
1928   Instruction *CastInst = *Casts.begin();
1929   if (Lane < UINT_MAX)
1930     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1931   else
1932     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1933 }
1934 
1935 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1936   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1937          "Primary induction variable must have an integer type");
1938 
1939   auto II = Legal->getInductionVars().find(IV);
1940   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
1941 
1942   auto ID = II->second;
1943   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1944 
1945   // The value from the original loop to which we are mapping the new induction
1946   // variable.
1947   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1948 
1949   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1950 
1951   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
1953   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
1954     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
1955            "Induction step should be loop invariant");
1956     if (PSE.getSE()->isSCEVable(IV->getType())) {
1957       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1958       return Exp.expandCodeFor(Step, Step->getType(),
1959                                LoopVectorPreHeader->getTerminator());
1960     }
1961     return cast<SCEVUnknown>(Step)->getValue();
1962   };
1963 
1964   // The scalar value to broadcast. This is derived from the canonical
1965   // induction variable. If a truncation type is given, truncate the canonical
1966   // induction variable and step. Otherwise, derive these values from the
1967   // induction descriptor.
1968   auto CreateScalarIV = [&](Value *&Step) -> Value * {
1969     Value *ScalarIV = Induction;
1970     if (IV != OldInduction) {
1971       ScalarIV = IV->getType()->isIntegerTy()
1972                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1973                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1974                                           IV->getType());
1975       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1976       ScalarIV->setName("offset.idx");
1977     }
1978     if (Trunc) {
1979       auto *TruncType = cast<IntegerType>(Trunc->getType());
1980       assert(Step->getType()->isIntegerTy() &&
1981              "Truncation requires an integer step");
1982       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1983       Step = Builder.CreateTrunc(Step, TruncType);
1984     }
1985     return ScalarIV;
1986   };
1987 
  // Create the vector values from the scalar IV, for the case where we are
  // not creating a vector IV.
1990   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
1991     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1992     for (unsigned Part = 0; Part < UF; ++Part) {
1993       assert(!VF.isScalable() && "scalable vectors not yet supported.");
1994       Value *EntryPart =
1995           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
1996                         ID.getInductionOpcode());
1997       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1998       if (Trunc)
1999         addMetadata(EntryPart, Trunc);
2000       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
2001     }
2002   };
2003 
2004   // Now do the actual transformations, and start with creating the step value.
2005   Value *Step = CreateStepValue(ID.getStep());
2006   if (VF.isZero() || VF.isScalar()) {
2007     Value *ScalarIV = CreateScalarIV(Step);
2008     CreateSplatIV(ScalarIV, Step);
2009     return;
2010   }
2011 
2012   // Determine if we want a scalar version of the induction variable. This is
2013   // true if the induction variable itself is not widened, or if it has at
2014   // least one user in the loop that is not widened.
2015   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2016   if (!NeedsScalarIV) {
2017     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2018     return;
2019   }
2020 
2021   // Try to create a new independent vector induction variable. If we can't
2022   // create the phi node, we will splat the scalar induction variable in each
2023   // loop iteration.
2024   if (!shouldScalarizeInstruction(EntryVal)) {
2025     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2026     Value *ScalarIV = CreateScalarIV(Step);
2027     // Create scalar steps that can be used by instructions we will later
2028     // scalarize. Note that the addition of the scalar steps will not increase
2029     // the number of instructions in the loop in the common case prior to
2030     // InstCombine. We will be trading one vector extract for each scalar step.
2031     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2032     return;
2033   }
2034 
  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV, except when we tail-fold; in that case the splat IV feeds
  // the predicate used by the masked loads/stores.
2038   Value *ScalarIV = CreateScalarIV(Step);
2039   if (!Cost->isScalarEpilogueAllowed())
2040     CreateSplatIV(ScalarIV, Step);
2041   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2042 }
2043 
2044 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2045                                           Instruction::BinaryOps BinOp) {
2046   // Create and check the types.
2047   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2048   int VLen = ValVTy->getNumElements();
2049 
2050   Type *STy = Val->getType()->getScalarType();
2051   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2052          "Induction Step must be an integer or FP");
2053   assert(Step->getType() == STy && "Step has wrong type");
2054 
2055   SmallVector<Constant *, 8> Indices;
2056 
2057   if (STy->isIntegerTy()) {
    // Create a vector of consecutive numbers starting at StartIdx.
2059     for (int i = 0; i < VLen; ++i)
2060       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2061 
2062     // Add the consecutive indices to the vector value.
2063     Constant *Cv = ConstantVector::get(Indices);
2064     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2065     Step = Builder.CreateVectorSplat(VLen, Step);
2066     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
2069     Step = Builder.CreateMul(Cv, Step);
2070     return Builder.CreateAdd(Val, Step, "induction");
2071   }
2072 
2073   // Floating point induction.
2074   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2075          "Binary Opcode should be specified for FP induction");
  // Create a vector of consecutive numbers starting at StartIdx.
2077   for (int i = 0; i < VLen; ++i)
2078     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2079 
2080   // Add the consecutive indices to the vector value.
2081   Constant *Cv = ConstantVector::get(Indices);
2082 
2083   Step = Builder.CreateVectorSplat(VLen, Step);
2084 
2085   // Floating point operations had to be 'fast' to enable the induction.
2086   FastMathFlags Flags;
2087   Flags.setFast();
2088 
2089   Value *MulOp = Builder.CreateFMul(Cv, Step);
2090   if (isa<Instruction>(MulOp))
    // Have to check: MulOp may be a constant.
2092     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2093 
2094   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2095   if (isa<Instruction>(BOp))
2096     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2097   return BOp;
2098 }
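
// Illustrative example (not from the original source): for an integer step,
// getStepVector(splat(%x), /*StartIdx=*/4, /*Step=*/%s, ...) with VF = 4
// computes roughly splat(%x) + <4, 5, 6, 7> * splat(%s), i.e. lane L holds
// %x + (4 + L) * %s.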
2099 
2100 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2101                                            Instruction *EntryVal,
2102                                            const InductionDescriptor &ID) {
2103   // We shouldn't have to build scalar steps if we aren't vectorizing.
2104   assert(VF.isVector() && "VF should be greater than one");
2105   assert(!VF.isScalable() &&
2106          "the code below assumes a fixed number of elements at compile time");
  // Get the value type and ensure it and the step have the same type.
2108   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2109   assert(ScalarIVTy == Step->getType() &&
2110          "Val and Step should have the same type");
2111 
2112   // We build scalar steps for both integer and floating-point induction
2113   // variables. Here, we determine the kind of arithmetic we will perform.
2114   Instruction::BinaryOps AddOp;
2115   Instruction::BinaryOps MulOp;
2116   if (ScalarIVTy->isIntegerTy()) {
2117     AddOp = Instruction::Add;
2118     MulOp = Instruction::Mul;
2119   } else {
2120     AddOp = ID.getInductionOpcode();
2121     MulOp = Instruction::FMul;
2122   }
2123 
2124   // Determine the number of scalars we need to generate for each unroll
2125   // iteration. If EntryVal is uniform, we only need to generate the first
2126   // lane. Otherwise, we generate all VF values.
2127   unsigned Lanes =
2128       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2129           ? 1
2130           : VF.getKnownMinValue();
2131   // Compute the scalar steps and save the results in VectorLoopValueMap.
2132   for (unsigned Part = 0; Part < UF; ++Part) {
2133     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2134       auto *StartIdx = getSignedIntOrFpConstant(
2135           ScalarIVTy, VF.getKnownMinValue() * Part + Lane);
2136       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2137       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2138       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2139       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2140     }
2141   }
2142 }
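
// Illustrative example (not from the original source): with VF = 4, UF = 2, a
// scalar IV %i and step 1, a non-uniform EntryVal gets the eight scalar
// values %i + 0 .. %i + 7 (one per Part * 4 + Lane), while a uniform EntryVal
// only gets lane zero of each part, i.e. %i + 0 and %i + 4.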
2143 
2144 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2145   assert(V != Induction && "The new induction variable should not be used.");
2146   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2147   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2148 
  // If this value is a stride that is being replaced by one, perform the
  // replacement here. Defer this for the VPlan-native path until we start
  // running Legal checks in that path.
2151   if (!EnableVPlanNativePath && Legal->hasStride(V))
2152     V = ConstantInt::get(V->getType(), 1);
2153 
2154   // If we have a vector mapped to this value, return it.
2155   if (VectorLoopValueMap.hasVectorValue(V, Part))
2156     return VectorLoopValueMap.getVectorValue(V, Part);
2157 
2158   // If the value has not been vectorized, check if it has been scalarized
2159   // instead. If it has been scalarized, and we actually need the value in
2160   // vector form, we will construct the vector values on demand.
2161   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2162     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2163 
2164     // If we've scalarized a value, that value should be an instruction.
2165     auto *I = cast<Instruction>(V);
2166 
2167     // If we aren't vectorizing, we can just copy the scalar map values over to
2168     // the vector map.
2169     if (VF == 1) {
2170       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2171       return ScalarValue;
2172     }
2173 
2174     // Get the last scalar instruction we generated for V and Part. If the value
2175     // is known to be uniform after vectorization, this corresponds to lane zero
2176     // of the Part unroll iteration. Otherwise, the last instruction is the one
2177     // we created for the last vector lane of the Part unroll iteration.
2178     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2179     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2180                             ? 0
2181                             : VF.getKnownMinValue() - 1;
2182     auto *LastInst = cast<Instruction>(
2183         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2184 
2185     // Set the insert point after the last scalarized instruction. This ensures
2186     // the insertelement sequence will directly follow the scalar definitions.
2187     auto OldIP = Builder.saveIP();
2188     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2189     Builder.SetInsertPoint(&*NewIP);
2190 
2191     // However, if we are vectorizing, we need to construct the vector values.
2192     // If the value is known to be uniform after vectorization, we can just
2193     // broadcast the scalar value corresponding to lane zero for each unroll
2194     // iteration. Otherwise, we construct the vector values using insertelement
2195     // instructions. Since the resulting vectors are stored in
2196     // VectorLoopValueMap, we will only generate the insertelements once.
2197     Value *VectorValue = nullptr;
2198     if (Cost->isUniformAfterVectorization(I, VF)) {
2199       VectorValue = getBroadcastInstrs(ScalarValue);
2200       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2201     } else {
2202       // Initialize packing with insertelements to start from undef.
2203       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2204       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2205       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2206       for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2207         packScalarIntoVectorValue(V, {Part, Lane});
2208       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2209     }
2210     Builder.restoreIP(OldIP);
2211     return VectorValue;
2212   }
2213 
2214   // If this scalar is unknown, assume that it is a constant or that it is
2215   // loop invariant. Broadcast V and save the value for future uses.
2216   Value *B = getBroadcastInstrs(V);
2217   VectorLoopValueMap.setVectorValue(V, Part, B);
2218   return B;
2219 }
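
// Illustrative example (not from the original source): if a non-uniform value
// %v was scalarized with VF = 4, the code above materializes its vector form
// by starting from undef and emitting four insertelement instructions, one
// per lane; if %v is uniform after vectorization, its lane-zero scalar is
// simply broadcast instead.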
2220 
2221 Value *
2222 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2223                                             const VPIteration &Instance) {
2224   // If the value is not an instruction contained in the loop, it should
2225   // already be scalar.
2226   if (OrigLoop->isLoopInvariant(V))
2227     return V;
2228 
2229   assert(Instance.Lane > 0
2230              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2231              : true && "Uniform values only have lane zero");
2232 
2233   // If the value from the original loop has not been vectorized, it is
2234   // represented by UF x VF scalar values in the new loop. Return the requested
2235   // scalar value.
2236   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2237     return VectorLoopValueMap.getScalarValue(V, Instance);
2238 
2239   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2240   // for the given unroll part. If this entry is not a vector type (i.e., the
2241   // vectorization factor is one), there is no need to generate an
2242   // extractelement instruction.
2243   auto *U = getOrCreateVectorValue(V, Instance.Part);
2244   if (!U->getType()->isVectorTy()) {
2245     assert(VF == 1 && "Value not scalarized has non-vector type");
2246     return U;
2247   }
2248 
2249   // Otherwise, the value from the original loop has been vectorized and is
2250   // represented by UF vector values. Extract and return the requested scalar
2251   // value from the appropriate vector lane.
2252   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2253 }
2254 
2255 void InnerLoopVectorizer::packScalarIntoVectorValue(
2256     Value *V, const VPIteration &Instance) {
2257   assert(V != Induction && "The new induction variable should not be used.");
2258   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2259   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2260 
2261   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2262   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2263   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2264                                             Builder.getInt32(Instance.Lane));
2265   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2266 }
2267 
2268 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2269   assert(Vec->getType()->isVectorTy() && "Invalid type");
2270   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
2271   SmallVector<int, 8> ShuffleMask;
2272   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2273     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2274 
2275   return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2276 }
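
// For example (illustrative), with VF = 4 this emits roughly
//   %reverse = shufflevector <4 x i32> %vec, <4 x i32> undef,
//                            <4 x i32> <i32 3, i32 2, i32 1, i32 0>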
2277 
2278 // Return whether we allow using masked interleave-groups (for dealing with
2279 // strided loads/stores that reside in predicated blocks, or for dealing
2280 // with gaps).
2281 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2282   // If an override option has been passed in for interleaved accesses, use it.
2283   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2284     return EnableMaskedInterleavedMemAccesses;
2285 
2286   return TTI.enableMaskedInterleavedAccessVectorization();
2287 }
2288 
2289 // Try to vectorize the interleave group that \p Instr belongs to.
2290 //
2291 // E.g. Translate following interleaved load group (factor = 3):
2292 //   for (i = 0; i < N; i+=3) {
2293 //     R = Pic[i];             // Member of index 0
2294 //     G = Pic[i+1];           // Member of index 1
2295 //     B = Pic[i+2];           // Member of index 2
2296 //     ... // do something to R, G, B
2297 //   }
2298 // To:
2299 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2300 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2301 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2302 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2303 //
2304 // Or translate following interleaved store group (factor = 3):
2305 //   for (i = 0; i < N; i+=3) {
2306 //     ... do something to R, G, B
2307 //     Pic[i]   = R;           // Member of index 0
2308 //     Pic[i+1] = G;           // Member of index 1
2309 //     Pic[i+2] = B;           // Member of index 2
2310 //   }
2311 // To:
2312 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2313 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2314 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2315 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2316 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2317 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2318     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2319     VPValue *Addr, VPValue *BlockInMask) {
2320   Instruction *Instr = Group->getInsertPos();
2321   const DataLayout &DL = Instr->getModule()->getDataLayout();
2322 
2323   // Prepare for the vector type of the interleaved load/store.
2324   Type *ScalarTy = getMemInstValueType(Instr);
2325   unsigned InterleaveFactor = Group->getFactor();
2326   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2327   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2328 
2329   // Prepare for the new pointers.
2330   SmallVector<Value *, 2> AddrParts;
2331   unsigned Index = Group->getIndex(Instr);
2332 
2333   // TODO: extend the masked interleaved-group support to reversed access.
2334   assert((!BlockInMask || !Group->isReverse()) &&
2335          "Reversed masked interleave-group not supported.");
2336 
2337   // If the group is reverse, adjust the index to refer to the last vector lane
2338   // instead of the first. We adjust the index from the first vector lane,
2339   // rather than directly getting the pointer for lane VF - 1, because the
2340   // pointer operand of the interleaved access is supposed to be uniform. For
2341   // uniform instructions, we're only required to generate a value for the
2342   // first vector lane in each unroll iteration.
2343   assert(!VF.isScalable() &&
2344          "scalable vector reverse operation is not implemented");
2345   if (Group->isReverse())
2346     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2347 
2348   for (unsigned Part = 0; Part < UF; Part++) {
2349     Value *AddrPart = State.get(Addr, {Part, 0});
2350     setDebugLocFromInst(Builder, AddrPart);
2351 
    // Notice that the current instruction could be at any index in the group.
    // We need to adjust the address down to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it down to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it down to A[i].
2363 
2364     bool InBounds = false;
2365     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2366       InBounds = gep->isInBounds();
2367     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2368     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2369 
2370     // Cast to the vector pointer type.
2371     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2372     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2373     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2374   }
2375 
2376   setDebugLocFromInst(Builder, Instr);
2377   Value *UndefVec = UndefValue::get(VecTy);
2378 
2379   Value *MaskForGaps = nullptr;
2380   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2381     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2382     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2383     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2384   }
2385 
2386   // Vectorize the interleaved load group.
2387   if (isa<LoadInst>(Instr)) {
2388     // For each unroll part, create a wide load for the group.
2389     SmallVector<Value *, 2> NewLoads;
2390     for (unsigned Part = 0; Part < UF; Part++) {
2391       Instruction *NewLoad;
2392       if (BlockInMask || MaskForGaps) {
2393         assert(useMaskedInterleavedAccesses(*TTI) &&
2394                "masked interleaved groups are not allowed.");
2395         Value *GroupMask = MaskForGaps;
2396         if (BlockInMask) {
2397           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2398           assert(!VF.isScalable() && "scalable vectors not yet supported.");
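          // For illustration only: with VF = 4 and factor 2, the block mask
          // <m0, m1, m2, m3> is replicated per member using the shuffle mask
          // <0, 0, 1, 1, 2, 2, 3, 3>, producing
          // <m0, m0, m1, m1, m2, m2, m3, m3>.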
2399           Value *ShuffledMask = Builder.CreateShuffleVector(
2400               BlockInMaskPart,
2401               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2402               "interleaved.mask");
2403           GroupMask = MaskForGaps
2404                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2405                                                 MaskForGaps)
2406                           : ShuffledMask;
2407         }
2408         NewLoad =
2409             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2410                                      GroupMask, UndefVec, "wide.masked.vec");
2411       }
2412       else
2413         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2414                                             Group->getAlign(), "wide.vec");
2415       Group->addMetadata(NewLoad);
2416       NewLoads.push_back(NewLoad);
2417     }
2418 
2419     // For each member in the group, shuffle out the appropriate data from the
2420     // wide loads.
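    //
    // For illustration only: with VF = 4 and factor 2, member 0 is extracted
    // from the <8 x Ty> wide load with the stride mask <0, 2, 4, 6> and
    // member 1 with <1, 3, 5, 7>.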
2421     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2422       Instruction *Member = Group->getMember(I);
2423 
2424       // Skip the gaps in the group.
2425       if (!Member)
2426         continue;
2427 
2428       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2429       auto StrideMask =
2430           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2431       for (unsigned Part = 0; Part < UF; Part++) {
2432         Value *StridedVec = Builder.CreateShuffleVector(
2433             NewLoads[Part], StrideMask, "strided.vec");
2434 
        // If this member has a different type, cast the result to that type.
2436         if (Member->getType() != ScalarTy) {
2437           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2438           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2439           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2440         }
2441 
2442         if (Group->isReverse())
2443           StridedVec = reverseVector(StridedVec);
2444 
2445         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2446       }
2447     }
2448     return;
2449   }
2450 
  // The subvector type for the current instruction.
2452   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2453   auto *SubVT = VectorType::get(ScalarTy, VF);
2454 
2455   // Vectorize the interleaved store group.
2456   for (unsigned Part = 0; Part < UF; Part++) {
2457     // Collect the stored vector from each member.
2458     SmallVector<Value *, 4> StoredVecs;
2459     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index has a
      // member.
2461       Instruction *Member = Group->getMember(i);
2462       assert(Member && "Fail to get a member from an interleaved store group");
2463 
2464       Value *StoredVec = getOrCreateVectorValue(
2465           cast<StoreInst>(Member)->getValueOperand(), Part);
2466       if (Group->isReverse())
2467         StoredVec = reverseVector(StoredVec);
2468 
      // If this member has a different type, cast it to the unified type.
2471       if (StoredVec->getType() != SubVT)
2472         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2473 
2474       StoredVecs.push_back(StoredVec);
2475     }
2476 
2477     // Concatenate all vectors into a wide vector.
2478     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2479 
2480     // Interleave the elements in the wide vector.
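    // For illustration only: with VF = 4 and factor 2 the interleave mask is
    // <0, 4, 1, 5, 2, 6, 3, 7>, pairing element i of member 0 with element i
    // of member 1.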
2481     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2482     Value *IVec = Builder.CreateShuffleVector(
2483         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2484         "interleaved.vec");
2485 
2486     Instruction *NewStoreInstr;
2487     if (BlockInMask) {
2488       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2489       Value *ShuffledMask = Builder.CreateShuffleVector(
2490           BlockInMaskPart,
2491           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2492           "interleaved.mask");
2493       NewStoreInstr = Builder.CreateMaskedStore(
2494           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2495     }
2496     else
2497       NewStoreInstr =
2498           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2499 
2500     Group->addMetadata(NewStoreInstr);
2501   }
2502 }
2503 
2504 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2505                                                      VPTransformState &State,
2506                                                      VPValue *Addr,
2507                                                      VPValue *StoredValue,
2508                                                      VPValue *BlockInMask) {
  // Attempt to issue a wide load or store.
2510   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2511   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2512 
2513   assert((LI || SI) && "Invalid Load/Store instruction");
2514   assert((!SI || StoredValue) && "No stored value provided for widened store");
2515   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2516 
2517   LoopVectorizationCostModel::InstWidening Decision =
2518       Cost->getWideningDecision(Instr, VF);
2519   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2520           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2521           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2522          "CM decision is not to widen the memory instruction");
2523 
2524   Type *ScalarDataTy = getMemInstValueType(Instr);
2525 
2526   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2527   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2528   const Align Alignment = getLoadStoreAlignment(Instr);
2529 
2530   // Determine if the pointer operand of the access is either consecutive or
2531   // reverse consecutive.
2532   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2533   bool ConsecutiveStride =
2534       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2535   bool CreateGatherScatter =
2536       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2537 
2538   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2539   // gather/scatter. Otherwise Decision should have been to Scalarize.
2540   assert((ConsecutiveStride || CreateGatherScatter) &&
2541          "The instruction should be scalarized");
2542   (void)ConsecutiveStride;
2543 
2544   VectorParts BlockInMaskParts(UF);
2545   bool isMaskRequired = BlockInMask;
2546   if (isMaskRequired)
2547     for (unsigned Part = 0; Part < UF; ++Part)
2548       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2549 
2550   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2551     // Calculate the pointer for the specific unroll-part.
2552     GetElementPtrInst *PartPtr = nullptr;
2553 
2554     bool InBounds = false;
2555     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2556       InBounds = gep->isInBounds();
2557 
2558     if (Reverse) {
2559       // If the address is consecutive but reversed, then the
2560       // wide store needs to start at the last vector element.
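      // For illustration only: with VF = 4, the two GEPs below compute
      // Ptr - Part * 4 and then subtract another 3 elements, so for Part = 1
      // the wide access covers elements [Ptr - 7, Ptr - 4].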
2561       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2562           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2563       PartPtr->setIsInBounds(InBounds);
2564       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2565           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2566       PartPtr->setIsInBounds(InBounds);
2567       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2568         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2569     } else {
2570       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2571           ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue())));
2572       PartPtr->setIsInBounds(InBounds);
2573     }
2574 
2575     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2576     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2577   };
2578 
2579   // Handle Stores:
2580   if (SI) {
2581     setDebugLocFromInst(Builder, SI);
2582 
2583     for (unsigned Part = 0; Part < UF; ++Part) {
2584       Instruction *NewSI = nullptr;
2585       Value *StoredVal = State.get(StoredValue, Part);
2586       if (CreateGatherScatter) {
2587         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2588         Value *VectorGep = State.get(Addr, Part);
2589         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2590                                             MaskPart);
2591       } else {
2592         if (Reverse) {
2593           // If we store to reverse consecutive memory locations, then we need
2594           // to reverse the order of elements in the stored value.
2595           StoredVal = reverseVector(StoredVal);
2596           // We don't want to update the value in the map as it might be used in
2597           // another expression. So don't call resetVectorValue(StoredVal).
2598         }
2599         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2600         if (isMaskRequired)
2601           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2602                                             BlockInMaskParts[Part]);
2603         else
2604           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2605       }
2606       addMetadata(NewSI, SI);
2607     }
2608     return;
2609   }
2610 
2611   // Handle loads.
2612   assert(LI && "Must have a load instruction");
2613   setDebugLocFromInst(Builder, LI);
2614   for (unsigned Part = 0; Part < UF; ++Part) {
2615     Value *NewLI;
2616     if (CreateGatherScatter) {
2617       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2618       Value *VectorGep = State.get(Addr, Part);
2619       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2620                                          nullptr, "wide.masked.gather");
2621       addMetadata(NewLI, LI);
2622     } else {
2623       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2624       if (isMaskRequired)
2625         NewLI = Builder.CreateMaskedLoad(
2626             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2627             "wide.masked.load");
2628       else
2629         NewLI =
2630             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2631 
2632       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2633       addMetadata(NewLI, LI);
2634       if (Reverse)
2635         NewLI = reverseVector(NewLI);
2636     }
2637     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2638   }
2639 }
2640 
2641 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2642                                                const VPIteration &Instance,
2643                                                bool IfPredicateInstr,
2644                                                VPTransformState &State) {
2645   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2646 
2647   setDebugLocFromInst(Builder, Instr);
2648 
  // Does this instruction return a value?
2650   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2651 
2652   Instruction *Cloned = Instr->clone();
2653   if (!IsVoidRetTy)
2654     Cloned->setName(Instr->getName() + ".cloned");
2655 
2656   // Replace the operands of the cloned instructions with their scalar
2657   // equivalents in the new loop.
2658   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2659     auto *NewOp = State.get(User.getOperand(op), Instance);
2660     Cloned->setOperand(op, NewOp);
2661   }
2662   addNewMetadata(Cloned, Instr);
2663 
2664   // Place the cloned scalar in the new loop.
2665   Builder.Insert(Cloned);
2666 
2667   // Add the cloned scalar to the scalar map entry.
2668   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2669 
  // If we just cloned a new assumption, add it to the assumption cache.
2671   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2672     if (II->getIntrinsicID() == Intrinsic::assume)
2673       AC->registerAssumption(II);
2674 
2675   // End if-block.
2676   if (IfPredicateInstr)
2677     PredicatedInstructions.push_back(Cloned);
2678 }
2679 
2680 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2681                                                       Value *End, Value *Step,
2682                                                       Instruction *DL) {
2683   BasicBlock *Header = L->getHeader();
2684   BasicBlock *Latch = L->getLoopLatch();
2685   // As we're just creating this loop, it's possible no latch exists
2686   // yet. If so, use the header as this will be a single block loop.
2687   if (!Latch)
2688     Latch = Header;
2689 
2690   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2691   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2692   setDebugLocFromInst(Builder, OldInst);
2693   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2694 
2695   Builder.SetInsertPoint(Latch->getTerminator());
2696   setDebugLocFromInst(Builder, OldInst);
2697 
2698   // Create i+1 and fill the PHINode.
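  // For illustration only, the latch ends up looking roughly like:
  //   %index.next = add i64 %index, <Step>
  //   %cmp = icmp eq i64 %index.next, %End
  //   br i1 %cmp, label %exit, label %header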
2699   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2700   Induction->addIncoming(Start, L->getLoopPreheader());
2701   Induction->addIncoming(Next, Latch);
2702   // Create the compare.
2703   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2704   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2705 
2706   // Now we have two terminators. Remove the old one from the block.
2707   Latch->getTerminator()->eraseFromParent();
2708 
2709   return Induction;
2710 }
2711 
2712 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2713   if (TripCount)
2714     return TripCount;
2715 
2716   assert(L && "Create Trip Count for null loop.");
2717   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2718   // Find the loop boundaries.
2719   ScalarEvolution *SE = PSE.getSE();
2720   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2721   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2722          "Invalid loop count");
2723 
2724   Type *IdxTy = Legal->getWidestInductionType();
2725   assert(IdxTy && "No type for induction");
2726 
  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way we can get a backedge-taken count here is if the
  // induction variable was signed and as such will not overflow. In that case
  // truncation is legal.
2732   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2733       IdxTy->getPrimitiveSizeInBits())
2734     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2735   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2736 
2737   // Get the total trip count from the count by adding 1.
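  // For illustration only: for a loop like `for (i = 0; i < n; ++i)` the
  // backedge-taken count is n - 1 and the trip count is n.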
2738   const SCEV *ExitCount = SE->getAddExpr(
2739       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2740 
2741   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2742 
2743   // Expand the trip count and place the new instructions in the preheader.
2744   // Notice that the pre-header does not change, only the loop body.
2745   SCEVExpander Exp(*SE, DL, "induction");
2746 
2747   // Count holds the overall loop count (N).
2748   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2749                                 L->getLoopPreheader()->getTerminator());
2750 
2751   if (TripCount->getType()->isPointerTy())
2752     TripCount =
2753         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2754                                     L->getLoopPreheader()->getTerminator());
2755 
2756   return TripCount;
2757 }
2758 
2759 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2760   if (VectorTripCount)
2761     return VectorTripCount;
2762 
2763   Value *TC = getOrCreateTripCount(L);
2764   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2765 
2766   Type *Ty = TC->getType();
2767   // This is where we can make the step a runtime constant.
2768   assert(!VF.isScalable() && "scalable vectorization is not supported yet");
2769   Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
2770 
2771   // If the tail is to be folded by masking, round the number of iterations N
2772   // up to a multiple of Step instead of rounding down. This is done by first
2773   // adding Step-1 and then rounding down. Note that it's ok if this addition
2774   // overflows: the vector induction variable will eventually wrap to zero given
2775   // that it starts at zero and its Step is a power of two; the loop will then
2776   // exit, with the last early-exit vector comparison also producing all-true.
2777   if (Cost->foldTailByMasking()) {
2778     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2779            "VF*UF must be a power of 2 when folding tail by masking");
2780     TC = Builder.CreateAdd(
2781         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
2782   }
2783 
2784   // Now we need to generate the expression for the part of the loop that the
2785   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2786   // iterations are not required for correctness, or N - Step, otherwise. Step
2787   // is equal to the vectorization factor (number of SIMD elements) times the
2788   // unroll factor (number of SIMD instructions).
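  // For illustration only: with N = 10 and Step = VF * UF = 8, the code below
  // computes n.mod.vf = 10 urem 8 = 2 and n.vec = 10 - 2 = 8. With tail
  // folding, N was first rounded up to 17, giving n.mod.vf = 1 and n.vec = 16.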
2789   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2790 
2791   // If there is a non-reversed interleaved group that may speculatively access
2792   // memory out-of-bounds, we need to ensure that there will be at least one
2793   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2794   // the trip count, we set the remainder to be equal to the step. If the step
2795   // does not evenly divide the trip count, no adjustment is necessary since
2796   // there will already be scalar iterations. Note that the minimum iterations
2797   // check ensures that N >= Step.
2798   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
2799     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2800     R = Builder.CreateSelect(IsZero, Step, R);
2801   }
2802 
2803   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2804 
2805   return VectorTripCount;
2806 }
2807 
2808 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2809                                                    const DataLayout &DL) {
  // Verify that V is a vector with the same number of elements as DstVTy.
2811   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2812   unsigned VF = DstFVTy->getNumElements();
2813   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2815   Type *SrcElemTy = SrcVecTy->getElementType();
2816   Type *DstElemTy = DstFVTy->getElementType();
2817   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2818          "Vector elements must have same size");
2819 
2820   // Do a direct cast if element types are castable.
2821   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2822     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2823   }
  // V cannot be directly cast to the desired vector type. This may happen
  // when V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice versa. Handle this with a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
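  // For illustration only, assuming 32-bit pointers in the target data layout,
  // casting <4 x float> to <4 x i32*> goes through <4 x i32>:
  //   %tmp = bitcast <4 x float> %v to <4 x i32>
  //   %res = inttoptr <4 x i32> %tmp to <4 x i32*>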
2828   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2829          "Only one type should be a pointer type");
2830   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2831          "Only one type should be a floating point type");
2832   Type *IntTy =
2833       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2834   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2835   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2836   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2837 }
2838 
2839 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2840                                                          BasicBlock *Bypass) {
2841   Value *Count = getOrCreateTripCount(L);
2842   // Reuse existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
2844   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2845   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2846 
2847   // Generate code to check if the loop's trip count is less than VF * UF, or
2848   // equal to it in case a scalar epilogue is required; this implies that the
2849   // vector trip count is zero. This check also covers the case where adding one
2850   // to the backedge-taken count overflowed leading to an incorrect trip count
2851   // of zero. In this case we will also jump to the scalar loop.
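  // For illustration only: with VF = 4, UF = 2, no tail folding and no
  // required scalar epilogue, the branch created below bypasses the vector
  // loop whenever Count < 8.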
2852   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2853                                           : ICmpInst::ICMP_ULT;
2854 
2855   // If tail is to be folded, vector loop takes care of all iterations.
2856   Value *CheckMinIters = Builder.getFalse();
2857   if (!Cost->foldTailByMasking()) {
2858     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2859     CheckMinIters = Builder.CreateICmp(
2860         P, Count,
2861         ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF),
2862         "min.iters.check");
2863   }
2864   // Create new preheader for vector loop.
2865   LoopVectorPreHeader =
2866       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2867                  "vector.ph");
2868 
2869   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2870                                DT->getNode(Bypass)->getIDom()) &&
2871          "TC check is expected to dominate Bypass");
2872 
2873   // Update dominator for Bypass & LoopExit.
2874   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2875   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2876 
2877   ReplaceInstWithInst(
2878       TCCheckBlock->getTerminator(),
2879       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2880   LoopBypassBlocks.push_back(TCCheckBlock);
2881 }
2882 
2883 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2884   // Reuse existing vector loop preheader for SCEV checks.
  // Note that a new preheader block is generated for the vector loop.
2886   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2887 
  // Generate the code to check the SCEV assumptions that we made.
2889   // We want the new basic block to start at the first instruction in a
2890   // sequence of instructions that form a check.
2891   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2892                    "scev.check");
2893   Value *SCEVCheck = Exp.expandCodeForPredicate(
2894       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2895 
2896   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2897     if (C->isZero())
2898       return;
2899 
2900   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2901            (OptForSizeBasedOnProfile &&
2902             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2903          "Cannot SCEV check stride or overflow when optimizing for size");
2904 
2905   SCEVCheckBlock->setName("vector.scevcheck");
2906   // Create new preheader for vector loop.
2907   LoopVectorPreHeader =
2908       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2909                  nullptr, "vector.ph");
2910 
  // Update dominator only if this is the first RT check.
2912   if (LoopBypassBlocks.empty()) {
2913     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2914     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2915   }
2916 
2917   ReplaceInstWithInst(
2918       SCEVCheckBlock->getTerminator(),
2919       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2920   LoopBypassBlocks.push_back(SCEVCheckBlock);
2921   AddedSafetyChecks = true;
2922 }
2923 
2924 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2925   // VPlan-native path does not do any analysis for runtime checks currently.
2926   if (EnableVPlanNativePath)
2927     return;
2928 
2929   // Reuse existing vector loop preheader for runtime memory checks.
  // Note that a new preheader block is generated for the vector loop.
2931   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2932 
  // Generate the code that checks at runtime whether arrays overlap. We put the
2934   // checks into a separate block to make the more common case of few elements
2935   // faster.
2936   auto *LAI = Legal->getLAI();
2937   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
2938   if (!RtPtrChecking.Need)
2939     return;
2940 
2941   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2942     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2943            "Cannot emit memory checks when optimizing for size, unless forced "
2944            "to vectorize.");
2945     ORE->emit([&]() {
2946       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2947                                         L->getStartLoc(), L->getHeader())
2948              << "Code-size may be reduced by not forcing "
2949                 "vectorization, or by source-code modifications "
2950                 "eliminating the need for runtime checks "
2951                 "(e.g., adding 'restrict').";
2952     });
2953   }
2954 
2955   MemCheckBlock->setName("vector.memcheck");
2956   // Create new preheader for vector loop.
2957   LoopVectorPreHeader =
2958       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2959                  "vector.ph");
2960 
2961   auto *CondBranch = cast<BranchInst>(
2962       Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
2963   ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
2964   LoopBypassBlocks.push_back(MemCheckBlock);
2965   AddedSafetyChecks = true;
2966 
  // Update dominator only if this is the first RT check.
2968   if (LoopBypassBlocks.empty()) {
2969     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2970     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2971   }
2972 
2973   Instruction *FirstCheckInst;
2974   Instruction *MemRuntimeCheck;
2975   std::tie(FirstCheckInst, MemRuntimeCheck) =
2976       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
2977                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
2978   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
2979                             "claimed checks are required");
2980   CondBranch->setCondition(MemRuntimeCheck);
2981 
2982   // We currently don't use LoopVersioning for the actual loop cloning but we
2983   // still use it to add the noalias metadata.
2984   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2985                                           PSE.getSE());
2986   LVer->prepareNoAliasMetadata();
2987 }
2988 
2989 Value *InnerLoopVectorizer::emitTransformedIndex(
2990     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2991     const InductionDescriptor &ID) const {
2992 
2993   SCEVExpander Exp(*SE, DL, "induction");
2994   auto Step = ID.getStep();
2995   auto StartValue = ID.getStartValue();
2996   assert(Index->getType() == Step->getType() &&
2997          "Index type does not match StepValue type");
2998 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
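  // For illustration only, in the common integer case the transformed index
  // is simply StartValue + Index * Step; the helpers below just fold the
  // trivial Index == 0 and Step == 1 cases by hand instead of relying on SCEV.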
3005   auto CreateAdd = [&B](Value *X, Value *Y) {
3006     assert(X->getType() == Y->getType() && "Types don't match!");
3007     if (auto *CX = dyn_cast<ConstantInt>(X))
3008       if (CX->isZero())
3009         return Y;
3010     if (auto *CY = dyn_cast<ConstantInt>(Y))
3011       if (CY->isZero())
3012         return X;
3013     return B.CreateAdd(X, Y);
3014   };
3015 
3016   auto CreateMul = [&B](Value *X, Value *Y) {
3017     assert(X->getType() == Y->getType() && "Types don't match!");
3018     if (auto *CX = dyn_cast<ConstantInt>(X))
3019       if (CX->isOne())
3020         return Y;
3021     if (auto *CY = dyn_cast<ConstantInt>(Y))
3022       if (CY->isOne())
3023         return X;
3024     return B.CreateMul(X, Y);
3025   };
3026 
3027   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3028   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3029   // the DomTree is not kept up-to-date for additional blocks generated in the
3030   // vector loop. By using the header as insertion point, we guarantee that the
3031   // expanded instructions dominate all their uses.
3032   auto GetInsertPoint = [this, &B]() {
3033     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3034     if (InsertBB != LoopVectorBody &&
3035         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3036       return LoopVectorBody->getTerminator();
3037     return &*B.GetInsertPoint();
3038   };
3039   switch (ID.getKind()) {
3040   case InductionDescriptor::IK_IntInduction: {
3041     assert(Index->getType() == StartValue->getType() &&
3042            "Index type does not match StartValue type");
3043     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3044       return B.CreateSub(StartValue, Index);
3045     auto *Offset = CreateMul(
3046         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3047     return CreateAdd(StartValue, Offset);
3048   }
3049   case InductionDescriptor::IK_PtrInduction: {
3050     assert(isa<SCEVConstant>(Step) &&
3051            "Expected constant step for pointer induction");
3052     return B.CreateGEP(
3053         StartValue->getType()->getPointerElementType(), StartValue,
3054         CreateMul(Index,
3055                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3056   }
3057   case InductionDescriptor::IK_FpInduction: {
3058     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3059     auto InductionBinOp = ID.getInductionBinOp();
3060     assert(InductionBinOp &&
3061            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3062             InductionBinOp->getOpcode() == Instruction::FSub) &&
3063            "Original bin op should be defined for FP induction");
3064 
3065     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3066 
3067     // Floating point operations had to be 'fast' to enable the induction.
3068     FastMathFlags Flags;
3069     Flags.setFast();
3070 
3071     Value *MulExp = B.CreateFMul(StepValue, Index);
3072     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be a constant.
3074       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3075 
3076     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3077                                "induction");
3078     if (isa<Instruction>(BOp))
3079       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3080 
3081     return BOp;
3082   }
3083   case InductionDescriptor::IK_NoInduction:
3084     return nullptr;
3085   }
3086   llvm_unreachable("invalid enum");
3087 }
3088 
3089 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3090   LoopScalarBody = OrigLoop->getHeader();
3091   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3092   LoopExitBlock = OrigLoop->getExitBlock();
3093   assert(LoopExitBlock && "Must have an exit block");
3094   assert(LoopVectorPreHeader && "Invalid loop structure");
3095 
3096   LoopMiddleBlock =
3097       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3098                  LI, nullptr, Twine(Prefix) + "middle.block");
3099   LoopScalarPreHeader =
3100       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3101                  nullptr, Twine(Prefix) + "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo, since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
3105   LoopVectorBody =
3106       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3107                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3108 
3109   // Update dominator for loop exit.
3110   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3111 
3112   // Create and register the new vector loop.
3113   Loop *Lp = LI->AllocateLoop();
3114   Loop *ParentLoop = OrigLoop->getParentLoop();
3115 
3116   // Insert the new loop into the loop nest and register the new basic blocks
3117   // before calling any utilities such as SCEV that require valid LoopInfo.
3118   if (ParentLoop) {
3119     ParentLoop->addChildLoop(Lp);
3120   } else {
3121     LI->addTopLevelLoop(Lp);
3122   }
3123   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3124   return Lp;
3125 }
3126 
3127 void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
3128                                                       Value *VectorTripCount) {
3129   assert(VectorTripCount && L && "Expected valid arguments");
3130   // We are going to resume the execution of the scalar loop.
3131   // Go over all of the induction variables that we found and fix the
3132   // PHIs that are left in the scalar version of the loop.
3133   // The starting values of PHI nodes depend on the counter of the last
3134   // iteration in the vectorized loop.
3135   // If we come from a bypass edge then we need to start from the original
3136   // start value.
3137   for (auto &InductionEntry : Legal->getInductionVars()) {
3138     PHINode *OrigPhi = InductionEntry.first;
3139     InductionDescriptor II = InductionEntry.second;
3140 
    // Create phi nodes to merge from the backedge-taken check block.
3142     PHINode *BCResumeVal =
3143         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3144                         LoopScalarPreHeader->getTerminator());
3145     // Copy original phi DL over to the new one.
3146     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3147     Value *&EndValue = IVEndValues[OrigPhi];
3148     if (OrigPhi == OldInduction) {
3149       // We know what the end value is.
3150       EndValue = VectorTripCount;
3151     } else {
3152       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3153       Type *StepType = II.getStep()->getType();
3154       Instruction::CastOps CastOp =
3155           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3156       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3157       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3158       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3159       EndValue->setName("ind.end");
3160     }
3161 
3162     // The new PHI merges the original incoming value, in case of a bypass,
3163     // or the value at the end of the vectorized loop.
3164     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3165 
3166     // Fix the scalar body counter (PHI node).
3167     // The old induction's phi node in the scalar body needs the truncated
3168     // value.
3169     for (BasicBlock *BB : LoopBypassBlocks)
3170       BCResumeVal->addIncoming(II.getStartValue(), BB);
3171     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3172   }
3173 }
3174 
3175 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3176                                                       MDNode *OrigLoopID) {
3177   assert(L && "Expected valid loop.");
3178 
3179   // The trip counts should be cached by now.
3180   Value *Count = getOrCreateTripCount(L);
3181   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3182 
3183   // We need the OrigLoop (scalar loop part) latch terminator to help
3184   // produce correct debug info for the middle block BB instructions.
3185   // The legality check stage guarantees that the loop will have a single
3186   // latch.
3187   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3188          "Scalar loop latch terminator isn't a branch");
3189   BranchInst *ScalarLatchBr =
3190       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3191 
3192   // Add a check in the middle block to see if we have completed
3193   // all of the iterations in the first vector loop.
3194   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3195   // If tail is to be folded, we know we don't need to run the remainder.
3196   Value *CmpN = Builder.getTrue();
3197   if (!Cost->foldTailByMasking()) {
3198     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3199                            VectorTripCount, "cmp.n",
3200                            LoopMiddleBlock->getTerminator());
3201 
3202     // Here we use the same DebugLoc as the scalar loop latch branch instead
3203     // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging, e.g. if the compare got a line number inside the loop.
3206     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3207   }
3208 
3209   BranchInst *BrInst =
3210       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3211   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3212   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3213 
3214   // Get ready to start creating new instructions into the vectorized body.
3215   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3216          "Inconsistent vector loop preheader");
3217   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3218 
3219   Optional<MDNode *> VectorizedLoopID =
3220       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3221                                       LLVMLoopVectorizeFollowupVectorized});
3222   if (VectorizedLoopID.hasValue()) {
3223     L->setLoopID(VectorizedLoopID.getValue());
3224 
3225     // Do not setAlreadyVectorized if loop attributes have been defined
3226     // explicitly.
3227     return LoopVectorPreHeader;
3228   }
3229 
3230   // Keep all loop hints from the original loop on the vector loop (we'll
3231   // replace the vectorizer-specific hints below).
3232   if (MDNode *LID = OrigLoop->getLoopID())
3233     L->setLoopID(LID);
3234 
3235   LoopVectorizeHints Hints(L, true, *ORE);
3236   Hints.setAlreadyVectorized();
3237 
3238 #ifdef EXPENSIVE_CHECKS
3239   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3240   LI->verify(*DT);
3241 #endif
3242 
3243   return LoopVectorPreHeader;
3244 }
3245 
3246 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3247   /*
3248    In this function we generate a new loop. The new loop will contain
3249    the vectorized instructions while the old loop will continue to run the
3250    scalar remainder.
3251 
3252        [ ] <-- loop iteration number check.
3253     /   |
3254    /    v
3255   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3256   |  /  |
3257   | /   v
3258   ||   [ ]     <-- vector pre header.
3259   |/    |
3260   |     v
3261   |    [  ] \
3262   |    [  ]_|   <-- vector loop.
3263   |     |
3264   |     v
3265   |   -[ ]   <--- middle-block.
3266   |  /  |
3267   | /   v
3268   -|- >[ ]     <--- new preheader.
3269    |    |
3270    |    v
3271    |   [ ] \
3272    |   [ ]_|   <-- old scalar loop to handle remainder.
3273     \   |
3274      \  v
3275       >[ ]     <-- exit block.
3276    ...
3277    */
3278 
3279   // Get the metadata of the original loop before it gets modified.
3280   MDNode *OrigLoopID = OrigLoop->getLoopID();
3281 
3282   // Create an empty vector loop, and prepare basic blocks for the runtime
3283   // checks.
3284   Loop *Lp = createVectorLoopSkeleton("");
3285 
3286   // Now, compare the new count to zero. If it is zero skip the vector loop and
3287   // jump to the scalar loop. This check also covers the case where the
3288   // backedge-taken count is uint##_max: adding one to it will overflow leading
3289   // to an incorrect trip count of zero. In this (rare) case we will also jump
3290   // to the scalar loop.
3291   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3292 
3293   // Generate the code to check any assumptions that we've made for SCEV
3294   // expressions.
3295   emitSCEVChecks(Lp, LoopScalarPreHeader);
3296 
  // Generate the code that checks at runtime whether arrays overlap. We put the
3298   // checks into a separate block to make the more common case of few elements
3299   // faster.
3300   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3301 
3302   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
3304   // induction variables. In the code below we also support a case where we
3305   // don't have a single induction variable.
3306   //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
3309   //   - is an integer
3310   //   - counts from zero, stepping by one
3311   //   - is the size of the widest induction variable type
3312   // then we create a new one.
3313   OldInduction = Legal->getPrimaryInduction();
3314   Type *IdxTy = Legal->getWidestInductionType();
3315   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3316   // The loop step is equal to the vectorization factor (num of SIMD elements)
3317   // times the unroll factor (num of SIMD instructions).
3318   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3319   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
3320   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3321   Induction =
3322       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3323                               getDebugLocFromInstOrOperands(OldInduction));
3324 
3325   // Emit phis for the new starting index of the scalar loop.
3326   createInductionResumeValues(Lp, CountRoundDown);
3327 
3328   return completeLoopSkeleton(Lp, OrigLoopID);
3329 }
3330 
3331 // Fix up external users of the induction variable. At this point, we are
3332 // in LCSSA form, with all external PHIs that use the IV having one input value,
3333 // coming from the remainder loop. We need those PHIs to also have a correct
3334 // value for the IV when arriving directly from the middle block.
3335 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3336                                        const InductionDescriptor &II,
3337                                        Value *CountRoundDown, Value *EndValue,
3338                                        BasicBlock *MiddleBlock) {
3339   // There are two kinds of external IV usages - those that use the value
3340   // computed in the last iteration (the PHI) and those that use the penultimate
3341   // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they obviously have different values.
3343 
3344   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3345 
3346   DenseMap<Value *, Value *> MissingVals;
3347 
3348   // An external user of the last iteration's value should see the value that
3349   // the remainder loop uses to initialize its own IV.
3350   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3351   for (User *U : PostInc->users()) {
3352     Instruction *UI = cast<Instruction>(U);
3353     if (!OrigLoop->contains(UI)) {
3354       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3355       MissingVals[UI] = EndValue;
3356     }
3357   }
3358 
  // An external user of the penultimate value needs to see EndValue - Step.
3360   // The simplest way to get this is to recompute it from the constituent SCEVs,
3361   // that is Start + (Step * (CRD - 1)).
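  // For illustration only: for an IV starting at 0 with step 1 and a vector
  // trip count of 8, the escaping value computed below is 0 + 1 * (8 - 1) = 7,
  // i.e. EndValue - Step.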
3362   for (User *U : OrigPhi->users()) {
3363     auto *UI = cast<Instruction>(U);
3364     if (!OrigLoop->contains(UI)) {
3365       const DataLayout &DL =
3366           OrigLoop->getHeader()->getModule()->getDataLayout();
3367       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3368 
3369       IRBuilder<> B(MiddleBlock->getTerminator());
3370       Value *CountMinusOne = B.CreateSub(
3371           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3372       Value *CMO =
3373           !II.getStep()->getType()->isIntegerTy()
3374               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3375                              II.getStep()->getType())
3376               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3377       CMO->setName("cast.cmo");
3378       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3379       Escape->setName("ind.escape");
3380       MissingVals[UI] = Escape;
3381     }
3382   }
3383 
3384   for (auto &I : MissingVals) {
3385     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3387     // that is %IV2 = phi [...], [ %IV1, %latch ]
3388     // In this case, if IV1 has an external use, we need to avoid adding both
3389     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3390     // don't already have an incoming value for the middle block.
3391     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3392       PHI->addIncoming(I.second, MiddleBlock);
3393   }
3394 }
3395 
3396 namespace {
3397 
3398 struct CSEDenseMapInfo {
3399   static bool canHandle(const Instruction *I) {
3400     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3401            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3402   }
3403 
3404   static inline Instruction *getEmptyKey() {
3405     return DenseMapInfo<Instruction *>::getEmptyKey();
3406   }
3407 
3408   static inline Instruction *getTombstoneKey() {
3409     return DenseMapInfo<Instruction *>::getTombstoneKey();
3410   }
3411 
3412   static unsigned getHashValue(const Instruction *I) {
3413     assert(canHandle(I) && "Unknown instruction!");
3414     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3415                                                            I->value_op_end()));
3416   }
3417 
3418   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3419     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3420         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3421       return LHS == RHS;
3422     return LHS->isIdenticalTo(RHS);
3423   }
3424 };
3425 
3426 } // end anonymous namespace
3427 
/// Perform CSE of induction variable instructions.
3429 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3431   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3432   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3433     Instruction *In = &*I++;
3434 
3435     if (!CSEDenseMapInfo::canHandle(In))
3436       continue;
3437 
3438     // Check if we can replace this instruction with any of the
3439     // visited instructions.
3440     if (Instruction *V = CSEMap.lookup(In)) {
3441       In->replaceAllUsesWith(V);
3442       In->eraseFromParent();
3443       continue;
3444     }
3445 
3446     CSEMap[In] = In;
3447   }
3448 }
3449 
3450 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3451                                                        ElementCount VF,
3452                                                        bool &NeedToScalarize) {
3453   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3454   Function *F = CI->getCalledFunction();
3455   Type *ScalarRetTy = CI->getType();
3456   SmallVector<Type *, 4> Tys, ScalarTys;
3457   for (auto &ArgOp : CI->arg_operands())
3458     ScalarTys.push_back(ArgOp->getType());
3459 
3460   // Estimate cost of scalarized vector call. The source operands are assumed
3461   // to be vectors, so we need to extract individual elements from there,
3462   // execute VF scalar calls, and then gather the result into the vector return
3463   // value.
3464   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3465                                                  TTI::TCK_RecipThroughput);
3466   if (VF.isScalar())
3467     return ScalarCallCost;
3468 
3469   // Compute corresponding vector type for return value and arguments.
3470   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3471   for (Type *ScalarTy : ScalarTys)
3472     Tys.push_back(ToVectorTy(ScalarTy, VF));
3473 
3474   // Compute costs of unpacking argument values for the scalar calls and
3475   // packing the return values to a vector.
3476   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3477 
3478   unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3479 
3480   // If we can't emit a vector call for this function, then the currently found
3481   // cost is the cost we need to return.
3482   NeedToScalarize = true;
3483   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3484   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3485 
3486   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3487     return Cost;
3488 
3489   // If the corresponding vector cost is cheaper, return its cost.
3490   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3491                                                  TTI::TCK_RecipThroughput);
3492   if (VectorCallCost < Cost) {
3493     NeedToScalarize = false;
3494     return VectorCallCost;
3495   }
3496   return Cost;
3497 }
3498 
3499 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3500                                                             ElementCount VF) {
3501   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3502   assert(ID && "Expected intrinsic call!");
3503 
3504   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3505   return TTI.getIntrinsicInstrCost(CostAttrs,
3506                                    TargetTransformInfo::TCK_RecipThroughput);
3507 }
3508 
3509 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3510   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3511   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3512   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3513 }
3514 
3515 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3516   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3517   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3518   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3519 }
3520 
3521 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3522   // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I`, and re-extend its result. InstCombine runs
3524   // later and will remove any ext/trunc pairs.
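  // For illustration only: if MinBWs records that an i32 add only needs 8
  // bits, its vectorized form `add <4 x i32>` becomes a trunc of the operands
  // to <4 x i8>, an `add <4 x i8>`, and a zext of the result back to
  // <4 x i32>.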
3525   SmallPtrSet<Value *, 4> Erased;
3526   for (const auto &KV : Cost->getMinimalBitwidths()) {
3527     // If the value wasn't vectorized, we must maintain the original scalar
3528     // type. The absence of the value from VectorLoopValueMap indicates that it
3529     // wasn't vectorized.
3530     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3531       continue;
3532     for (unsigned Part = 0; Part < UF; ++Part) {
3533       Value *I = getOrCreateVectorValue(KV.first, Part);
3534       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3535         continue;
3536       Type *OriginalTy = I->getType();
3537       Type *ScalarTruncatedTy =
3538           IntegerType::get(OriginalTy->getContext(), KV.second);
3539       auto *TruncatedTy = FixedVectorType::get(
3540           ScalarTruncatedTy,
3541           cast<FixedVectorType>(OriginalTy)->getNumElements());
3542       if (TruncatedTy == OriginalTy)
3543         continue;
3544 
3545       IRBuilder<> B(cast<Instruction>(I));
3546       auto ShrinkOperand = [&](Value *V) -> Value * {
3547         if (auto *ZI = dyn_cast<ZExtInst>(V))
3548           if (ZI->getSrcTy() == TruncatedTy)
3549             return ZI->getOperand(0);
3550         return B.CreateZExtOrTrunc(V, TruncatedTy);
3551       };
3552 
3553       // The actual instruction modification depends on the instruction type,
3554       // unfortunately.
3555       Value *NewI = nullptr;
3556       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3557         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3558                              ShrinkOperand(BO->getOperand(1)));
3559 
3560         // Any wrapping introduced by shrinking this operation shouldn't be
3561         // considered undefined behavior. So, we can't unconditionally copy
3562         // arithmetic wrapping flags to NewI.
3563         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3564       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3565         NewI =
3566             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3567                          ShrinkOperand(CI->getOperand(1)));
3568       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3569         NewI = B.CreateSelect(SI->getCondition(),
3570                               ShrinkOperand(SI->getTrueValue()),
3571                               ShrinkOperand(SI->getFalseValue()));
3572       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3573         switch (CI->getOpcode()) {
3574         default:
3575           llvm_unreachable("Unhandled cast!");
3576         case Instruction::Trunc:
3577           NewI = ShrinkOperand(CI->getOperand(0));
3578           break;
3579         case Instruction::SExt:
3580           NewI = B.CreateSExtOrTrunc(
3581               CI->getOperand(0),
3582               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3583           break;
3584         case Instruction::ZExt:
3585           NewI = B.CreateZExtOrTrunc(
3586               CI->getOperand(0),
3587               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3588           break;
3589         }
3590       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3591         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3592                              ->getNumElements();
3593         auto *O0 = B.CreateZExtOrTrunc(
3594             SI->getOperand(0),
3595             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3596         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3597                              ->getNumElements();
3598         auto *O1 = B.CreateZExtOrTrunc(
3599             SI->getOperand(1),
3600             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3601 
3602         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3603       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3604         // Don't do anything with the operands, just extend the result.
3605         continue;
3606       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3607         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3608                             ->getNumElements();
3609         auto *O0 = B.CreateZExtOrTrunc(
3610             IE->getOperand(0),
3611             FixedVectorType::get(ScalarTruncatedTy, Elements));
3612         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3613         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3614       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3615         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3616                             ->getNumElements();
3617         auto *O0 = B.CreateZExtOrTrunc(
3618             EE->getOperand(0),
3619             FixedVectorType::get(ScalarTruncatedTy, Elements));
3620         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3621       } else {
3622         // If we don't know what to do, be conservative and don't do anything.
3623         continue;
3624       }
3625 
3626       // Lastly, extend the result.
3627       NewI->takeName(cast<Instruction>(I));
3628       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3629       I->replaceAllUsesWith(Res);
3630       cast<Instruction>(I)->eraseFromParent();
3631       Erased.insert(I);
3632       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3633     }
3634   }
3635 
  // We'll have created a bunch of ZExts that are now dead (use-empty).
  // Clean them up.
3637   for (const auto &KV : Cost->getMinimalBitwidths()) {
3638     // If the value wasn't vectorized, we must maintain the original scalar
3639     // type. The absence of the value from VectorLoopValueMap indicates that it
3640     // wasn't vectorized.
3641     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3642       continue;
3643     for (unsigned Part = 0; Part < UF; ++Part) {
3644       Value *I = getOrCreateVectorValue(KV.first, Part);
3645       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3646       if (Inst && Inst->use_empty()) {
3647         Value *NewI = Inst->getOperand(0);
3648         Inst->eraseFromParent();
3649         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3650       }
3651     }
3652   }
3653 }
3654 
3655 void InnerLoopVectorizer::fixVectorizedLoop() {
3656   // Insert truncates and extends for any truncated instructions as hints to
3657   // InstCombine.
3658   if (VF.isVector())
3659     truncateToMinimalBitwidths();
3660 
3661   // Fix widened non-induction PHIs by setting up the PHI operands.
3662   if (OrigPHIsToFix.size()) {
3663     assert(EnableVPlanNativePath &&
3664            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3665     fixNonInductionPHIs();
3666   }
3667 
3668   // At this point every instruction in the original loop is widened to a
3669   // vector form. Now we need to fix the recurrences in the loop. These PHI
3670   // nodes are currently empty because we did not want to introduce cycles.
3671   // This is the second stage of vectorizing recurrences.
3672   fixCrossIterationPHIs();
3673 
3674   // Forget the original basic block.
3675   PSE.getSE()->forgetLoop(OrigLoop);
3676 
3677   // Fix-up external users of the induction variables.
3678   for (auto &Entry : Legal->getInductionVars())
3679     fixupIVUsers(Entry.first, Entry.second,
3680                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3681                  IVEndValues[Entry.first], LoopMiddleBlock);
3682 
3683   fixLCSSAPHIs();
3684   for (Instruction *PI : PredicatedInstructions)
3685     sinkScalarOperands(&*PI);
3686 
3687   // Remove redundant induction instructions.
3688   cse(LoopVectorBody);
3689 
  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop represented by LoopScalarBody becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
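  // Illustrative arithmetic only (the exact weights come from
  // setProfileInfoAfterUnrolling): if the original loop's profile implies
  // about 100 iterations per entry and VF * UF == 8, the vector loop is
  // credited with roughly 100 / 8 == 12 iterations and the scalar remainder
  // loop with the ~4 leftover iterations.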
3699   assert(!VF.isScalable() &&
3700          "cannot use scalable ElementCount to determine unroll factor");
3701   setProfileInfoAfterUnrolling(
3702       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3703       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3704 }
3705 
3706 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3707   // In order to support recurrences we need to be able to vectorize Phi nodes.
3708   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3709   // stage #2: We now need to fix the recurrences by adding incoming edges to
3710   // the currently empty PHI nodes. At this point every instruction in the
3711   // original loop is widened to a vector form so we can use them to construct
3712   // the incoming edges.
3713   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3714     // Handle first-order recurrences and reductions that need to be fixed.
3715     if (Legal->isFirstOrderRecurrence(&Phi))
3716       fixFirstOrderRecurrence(&Phi);
3717     else if (Legal->isReductionVariable(&Phi))
3718       fixReduction(&Phi);
3719   }
3720 }
3721 
3722 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3723   // This is the second phase of vectorizing first-order recurrences. An
3724   // overview of the transformation is described below. Suppose we have the
3725   // following loop.
3726   //
3727   //   for (int i = 0; i < n; ++i)
3728   //     b[i] = a[i] - a[i - 1];
3729   //
3730   // There is a first-order recurrence on "a". For this loop, the shorthand
3731   // scalar IR looks like:
3732   //
3733   //   scalar.ph:
3734   //     s_init = a[-1]
3735   //     br scalar.body
3736   //
3737   //   scalar.body:
3738   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3739   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3740   //     s2 = a[i]
3741   //     b[i] = s2 - s1
3742   //     br cond, scalar.body, ...
3743   //
  // In this example, s1 is a recurrence because its value depends on the
3745   // previous iteration. In the first phase of vectorization, we created a
3746   // temporary value for s1. We now complete the vectorization and produce the
3747   // shorthand vector IR shown below (for VF = 4, UF = 1).
3748   //
3749   //   vector.ph:
3750   //     v_init = vector(..., ..., ..., a[-1])
3751   //     br vector.body
3752   //
3753   //   vector.body
3754   //     i = phi [0, vector.ph], [i+4, vector.body]
3755   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3756   //     v2 = a[i, i+1, i+2, i+3];
3757   //     v3 = vector(v1(3), v2(0, 1, 2))
3758   //     b[i, i+1, i+2, i+3] = v2 - v3
3759   //     br cond, vector.body, middle.block
3760   //
3761   //   middle.block:
3762   //     x = v2(3)
3763   //     br scalar.ph
3764   //
3765   //   scalar.ph:
3766   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3767   //     br scalar.body
3768   //
  // After the vector loop completes execution, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3771 
3772   // Get the original loop preheader and single loop latch.
3773   auto *Preheader = OrigLoop->getLoopPreheader();
3774   auto *Latch = OrigLoop->getLoopLatch();
3775 
3776   // Get the initial and previous values of the scalar recurrence.
3777   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3778   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3779 
3780   // Create a vector from the initial value.
3781   auto *VectorInit = ScalarInit;
3782   if (VF.isVector()) {
3783     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3784     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
3785     VectorInit = Builder.CreateInsertElement(
3786         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3787         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
3788   }
3789 
3790   // We constructed a temporary phi node in the first phase of vectorization.
3791   // This phi node will eventually be deleted.
3792   Builder.SetInsertPoint(
3793       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3794 
3795   // Create a phi node for the new recurrence. The current value will either be
3796   // the initial value inserted into a vector or loop-varying vector value.
3797   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3798   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3799 
3800   // Get the vectorized previous value of the last part UF - 1. It appears last
3801   // among all unrolled iterations, due to the order of their construction.
3802   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3803 
3804   // Find and set the insertion point after the previous value if it is an
3805   // instruction.
3806   BasicBlock::iterator InsertPt;
3807   // Note that the previous value may have been constant-folded so it is not
3808   // guaranteed to be an instruction in the vector loop.
3809   // FIXME: Loop invariant values do not form recurrences. We should deal with
3810   //        them earlier.
3811   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3812     InsertPt = LoopVectorBody->getFirstInsertionPt();
3813   else {
3814     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3815     if (isa<PHINode>(PreviousLastPart))
3816       // If the previous value is a phi node, we should insert after all the phi
3817       // nodes in the block containing the PHI to avoid breaking basic block
3818       // verification. Note that the basic block may be different to
3819       // LoopVectorBody, in case we predicate the loop.
3820       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3821     else
3822       InsertPt = ++PreviousInst->getIterator();
3823   }
3824   Builder.SetInsertPoint(&*InsertPt);
3825 
3826   // We will construct a vector for the recurrence by combining the values for
3827   // the current and previous iterations. This is the required shuffle mask.
3828   assert(!VF.isScalable());
3829   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
3830   ShuffleMask[0] = VF.getKnownMinValue() - 1;
3831   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
3832     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
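  // For example, with VF = 4 the mask is <3, 4, 5, 6>: lane 3 of the first
  // shuffle operand (the incoming/previous vector) followed by lanes 0-2 of
  // the second operand, matching v3 = vector(v1(3), v2(0, 1, 2)) in the
  // sketch above.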
3833 
3834   // The vector from which to take the initial value for the current iteration
3835   // (actual or unrolled). Initially, this is the vector phi node.
3836   Value *Incoming = VecPhi;
3837 
3838   // Shuffle the current and previous vector and update the vector parts.
3839   for (unsigned Part = 0; Part < UF; ++Part) {
3840     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3841     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3842     auto *Shuffle =
3843         VF.isVector()
3844             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
3845             : Incoming;
3846     PhiPart->replaceAllUsesWith(Shuffle);
3847     cast<Instruction>(PhiPart)->eraseFromParent();
3848     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3849     Incoming = PreviousPart;
3850   }
3851 
3852   // Fix the latch value of the new recurrence in the vector loop.
3853   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3854 
3855   // Extract the last vector element in the middle block. This will be the
3856   // initial value for the recurrence when jumping to the scalar loop.
3857   auto *ExtractForScalar = Incoming;
3858   if (VF.isVector()) {
3859     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3860     ExtractForScalar = Builder.CreateExtractElement(
3861         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
3862         "vector.recur.extract");
3863   }
3864   // Extract the second last element in the middle block if the
3865   // Phi is used outside the loop. We need to extract the phi itself
3866   // and not the last element (the phi update in the current iteration). This
3867   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3868   // when the scalar loop is not run at all.
3869   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3870   if (VF.isVector())
3871     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3872         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
3873         "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
  // `Incoming`. This is analogous to the vectorized case above: extracting the
  // second-last element when VF > 1.
3878   else if (UF > 1)
3879     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
3880 
3881   // Fix the initial value of the original recurrence in the scalar loop.
3882   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3883   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3884   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3885     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3886     Start->addIncoming(Incoming, BB);
3887   }
3888 
3889   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3890   Phi->setName("scalar.recur");
3891 
3892   // Finally, fix users of the recurrence outside the loop. The users will need
3893   // either the last value of the scalar recurrence or the last value of the
3894   // vector recurrence we extracted in the middle block. Since the loop is in
3895   // LCSSA form, we just need to find all the phi nodes for the original scalar
3896   // recurrence in the exit block, and then add an edge for the middle block.
3897   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3898     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3899       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3900     }
3901   }
3902 }
3903 
3904 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3905   Constant *Zero = Builder.getInt32(0);
3906 
  // Get its reduction variable descriptor.
3908   assert(Legal->isReductionVariable(Phi) &&
3909          "Unable to find the reduction variable");
3910   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
3911 
3912   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3913   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3914   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3915   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3916     RdxDesc.getMinMaxRecurrenceKind();
3917   setDebugLocFromInst(Builder, ReductionStartValue);
3918   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
3919 
3920   // We need to generate a reduction vector from the incoming scalar.
3921   // To do so, we need to generate the 'identity' vector and override
3922   // one of the elements with the incoming scalar reduction. We need
3923   // to do it in the vector-loop preheader.
3924   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3925 
3926   // This is the vector-clone of the value that leaves the loop.
3927   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3928 
  // Find the reduction identity value: zero for addition, or and xor;
  // one for multiplication; -1 (all ones) for and.
3931   Value *Identity;
3932   Value *VectorStart;
3933   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3934       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
3936     if (VF == 1 || IsInLoopReductionPhi) {
3937       VectorStart = Identity = ReductionStartValue;
3938     } else {
3939       VectorStart = Identity =
3940         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3941     }
3942   } else {
3943     // Handle other reduction kinds:
3944     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3945         RK, VecTy->getScalarType());
3946     if (VF == 1 || IsInLoopReductionPhi) {
3947       Identity = Iden;
      // For a scalar (VF == 1) or in-loop reduction, use the incoming scalar
      // start value directly.
3950       VectorStart = ReductionStartValue;
3951     } else {
3952       Identity = ConstantVector::getSplat(VF, Iden);
3953 
3954       // This vector is the Identity vector where the first element is the
3955       // incoming scalar reduction.
3956       VectorStart =
3957         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3958     }
3959   }
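  // For example, for an integer add reduction with scalar start value %s and
  // VF = 4 (not in-loop), Identity is zeroinitializer and VectorStart is
  // <%s, 0, 0, 0>, i.e. the identity vector with the incoming scalar start
  // value inserted into lane 0.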
3960 
3961   // Wrap flags are in general invalid after vectorization, clear them.
3962   clearReductionWrapFlags(RdxDesc);
3963 
3964   // Fix the vector-loop phi.
3965 
  // Reductions do not have to start at zero. They can start with
  // any loop-invariant value.
3968   BasicBlock *Latch = OrigLoop->getLoopLatch();
3969   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3970 
3971   for (unsigned Part = 0; Part < UF; ++Part) {
3972     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3973     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3974     // Make sure to add the reduction start value only to the
3975     // first unroll part.
3976     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3977     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3978     cast<PHINode>(VecRdxPhi)
3979       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3980   }
3981 
3982   // Before each round, move the insertion point right between
3983   // the PHIs and the values we are going to write.
3984   // This allows us to write both PHINodes and the extractelement
3985   // instructions.
3986   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3987 
3988   setDebugLocFromInst(Builder, LoopExitInst);
3989 
  // If the tail is folded by masking, the vector value that leaves the loop
  // should be the select choosing between the vectorized LoopExitInst and the
  // vectorized Phi, rather than the LoopExitInst itself.
3993   if (Cost->foldTailByMasking()) {
3994     for (unsigned Part = 0; Part < UF; ++Part) {
3995       Value *VecLoopExitInst =
3996           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3997       Value *Sel = nullptr;
3998       for (User *U : VecLoopExitInst->users()) {
3999         if (isa<SelectInst>(U)) {
4000           assert(!Sel && "Reduction exit feeding two selects");
4001           Sel = U;
4002         } else
4003           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4004       }
4005       assert(Sel && "Reduction exit feeds no select");
4006       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4007 
4008       // If the target can create a predicated operator for the reduction at no
4009       // extra cost in the loop (for example a predicated vadd), it can be
4010       // cheaper for the select to remain in the loop than be sunk out of it,
4011       // and so use the select value for the phi instead of the old
4012       // LoopExitValue.
4013       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4014       if (PreferPredicatedReductionSelect ||
4015           TTI->preferPredicatedReductionSelect(
4016               RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()),
4017               Phi->getType(), TargetTransformInfo::ReductionFlags())) {
4018         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4019         VecRdxPhi->setIncomingValueForBlock(
4020             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4021       }
4022     }
4023   }
4024 
4025   // If the vector reduction can be performed in a smaller type, we truncate
4026   // then extend the loop exit value to enable InstCombine to evaluate the
4027   // entire expression in the smaller type.
4028   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4029     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4030     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4031     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4032     Builder.SetInsertPoint(
4033         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4034     VectorParts RdxParts(UF);
4035     for (unsigned Part = 0; Part < UF; ++Part) {
4036       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4037       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4038       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4039                                         : Builder.CreateZExt(Trunc, VecTy);
4040       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4041            UI != RdxParts[Part]->user_end();)
4042         if (*UI != Trunc) {
4043           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4044           RdxParts[Part] = Extnd;
4045         } else {
4046           ++UI;
4047         }
4048     }
4049     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4050     for (unsigned Part = 0; Part < UF; ++Part) {
4051       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4052       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4053     }
4054   }
4055 
4056   // Reduce all of the unrolled parts into a single vector.
4057   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4058   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4059 
4060   // The middle block terminator has already been assigned a DebugLoc here (the
4061   // OrigLoop's single latch terminator). We want the whole middle block to
4062   // appear to execute on this line because: (a) it is all compiler generated,
4063   // (b) these instructions are always executed after evaluating the latch
4064   // conditional branch, and (c) other passes may add new predecessors which
4065   // terminate on this line. This is the easiest way to ensure we don't
4066   // accidentally cause an extra step back into the loop while debugging.
4067   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4068   for (unsigned Part = 1; Part < UF; ++Part) {
4069     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4070     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4071       // Floating point operations had to be 'fast' to enable the reduction.
4072       ReducedPartRdx = addFastMathFlag(
4073           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4074                               ReducedPartRdx, "bin.rdx"),
4075           RdxDesc.getFastMathFlags());
4076     else
4077       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4078                                       RdxPart);
4079   }
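  // Illustrative sketch for an integer add reduction with VF = 4 and UF = 2
  // (assumed value names):
  //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
  // The combined vector is then reduced to a scalar below, e.g. via a
  // horizontal add reduction intrinsic such as
  //   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
  // (the exact intrinsic spelling depends on the LLVM version).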
4080 
4081   // Create the reduction after the loop. Note that inloop reductions create the
4082   // target reduction in the loop using a Reduction recipe.
4083   if (VF.isVector() && !IsInLoopReductionPhi) {
4084     bool NoNaN = Legal->hasFunNoNaNAttr();
4085     ReducedPartRdx =
4086         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4087     // If the reduction can be performed in a smaller type, we need to extend
4088     // the reduction to the wider type before we branch to the original loop.
4089     if (Phi->getType() != RdxDesc.getRecurrenceType())
4090       ReducedPartRdx =
4091         RdxDesc.isSigned()
4092         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4093         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4094   }
4095 
4096   // Create a phi node that merges control-flow from the backedge-taken check
4097   // block and the middle block.
4098   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4099                                         LoopScalarPreHeader->getTerminator());
4100   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4101     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4102   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4103 
4104   // Now, we need to fix the users of the reduction variable
4105   // inside and outside of the scalar remainder loop.
4106   // We know that the loop is in LCSSA form. We need to update the
4107   // PHI nodes in the exit blocks.
4108   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4109     // All PHINodes need to have a single entry edge, or two if
4110     // we already fixed them.
4111     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4112 
4113     // We found a reduction value exit-PHI. Update it with the
4114     // incoming bypass edge.
4115     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4116       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4117   } // end of the LCSSA phi scan.
4118 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
4121   int IncomingEdgeBlockIdx =
4122     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4123   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4124   // Pick the other block.
4125   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4126   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4127   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4128 }
4129 
4130 void InnerLoopVectorizer::clearReductionWrapFlags(
4131     RecurrenceDescriptor &RdxDesc) {
4132   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4133   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4134       RK != RecurrenceDescriptor::RK_IntegerMult)
4135     return;
4136 
4137   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4138   assert(LoopExitInstr && "null loop exit instruction");
4139   SmallVector<Instruction *, 8> Worklist;
4140   SmallPtrSet<Instruction *, 8> Visited;
4141   Worklist.push_back(LoopExitInstr);
4142   Visited.insert(LoopExitInstr);
4143 
4144   while (!Worklist.empty()) {
4145     Instruction *Cur = Worklist.pop_back_val();
4146     if (isa<OverflowingBinaryOperator>(Cur))
4147       for (unsigned Part = 0; Part < UF; ++Part) {
4148         Value *V = getOrCreateVectorValue(Cur, Part);
4149         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4150       }
4151 
4152     for (User *U : Cur->users()) {
4153       Instruction *UI = cast<Instruction>(U);
4154       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4155           Visited.insert(UI).second)
4156         Worklist.push_back(UI);
4157     }
4158   }
4159 }
4160 
4161 void InnerLoopVectorizer::fixLCSSAPHIs() {
4162   assert(!VF.isScalable() && "the code below assumes fixed width vectors");
4163   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4164     if (LCSSAPhi.getNumIncomingValues() == 1) {
4165       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4166       // Non-instruction incoming values will have only one value.
4167       unsigned LastLane = 0;
4168       if (isa<Instruction>(IncomingValue))
4169         LastLane = Cost->isUniformAfterVectorization(
4170                        cast<Instruction>(IncomingValue), VF)
4171                        ? 0
4172                        : VF.getKnownMinValue() - 1;
4173       // Can be a loop invariant incoming value or the last scalar value to be
4174       // extracted from the vectorized loop.
4175       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4176       Value *lastIncomingValue =
4177           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4178       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4179     }
4180   }
4181 }
4182 
4183 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4184   // The basic block and loop containing the predicated instruction.
4185   auto *PredBB = PredInst->getParent();
4186   auto *VectorLoop = LI->getLoopFor(PredBB);
4187 
4188   // Initialize a worklist with the operands of the predicated instruction.
4189   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4190 
4191   // Holds instructions that we need to analyze again. An instruction may be
4192   // reanalyzed if we don't yet know if we can sink it or not.
4193   SmallVector<Instruction *, 8> InstsToReanalyze;
4194 
4195   // Returns true if a given use occurs in the predicated block. Phi nodes use
4196   // their operands in their corresponding predecessor blocks.
4197   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4198     auto *I = cast<Instruction>(U.getUser());
4199     BasicBlock *BB = I->getParent();
4200     if (auto *Phi = dyn_cast<PHINode>(I))
4201       BB = Phi->getIncomingBlock(
4202           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4203     return BB == PredBB;
4204   };
4205 
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a pass
  // through the worklist doesn't sink a single instruction.
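  // For example, if a conditional store was scalarized into its own
  // predicated block, an address computation feeding only that store, e.g.
  //   %gep = getelementptr i32, i32* %a, i64 %idx
  // can be sunk into the block, and sinking %gep may in turn allow the
  // instruction defining %idx to be sunk on a later pass of the worklist.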
4210   bool Changed;
4211   do {
4212     // Add the instructions that need to be reanalyzed to the worklist, and
4213     // reset the changed indicator.
4214     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4215     InstsToReanalyze.clear();
4216     Changed = false;
4217 
4218     while (!Worklist.empty()) {
4219       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4220 
4221       // We can't sink an instruction if it is a phi node, is already in the
4222       // predicated block, is not in the loop, or may have side effects.
4223       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4224           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4225         continue;
4226 
4227       // It's legal to sink the instruction if all its uses occur in the
4228       // predicated block. Otherwise, there's nothing to do yet, and we may
4229       // need to reanalyze the instruction.
4230       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4231         InstsToReanalyze.push_back(I);
4232         continue;
4233       }
4234 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4237       I->moveBefore(&*PredBB->getFirstInsertionPt());
4238       Worklist.insert(I->op_begin(), I->op_end());
4239 
4240       // The sinking may have enabled other instructions to be sunk, so we will
4241       // need to iterate.
4242       Changed = true;
4243     }
4244   } while (Changed);
4245 }
4246 
4247 void InnerLoopVectorizer::fixNonInductionPHIs() {
4248   for (PHINode *OrigPhi : OrigPHIsToFix) {
4249     PHINode *NewPhi =
4250         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4251     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4252 
4253     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4254         predecessors(OrigPhi->getParent()));
4255     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4256         predecessors(NewPhi->getParent()));
4257     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4258            "Scalar and Vector BB should have the same number of predecessors");
4259 
4260     // The insertion point in Builder may be invalidated by the time we get
4261     // here. Force the Builder insertion point to something valid so that we do
4262     // not run into issues during insertion point restore in
4263     // getOrCreateVectorValue calls below.
4264     Builder.SetInsertPoint(NewPhi);
4265 
4266     // The predecessor order is preserved and we can rely on mapping between
4267     // scalar and vector block predecessors.
4268     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4269       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4270 
4271       // When looking up the new scalar/vector values to fix up, use incoming
4272       // values from original phi.
4273       Value *ScIncV =
4274           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4275 
4276       // Scalar incoming value may need a broadcast
4277       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4278       NewPhi->addIncoming(NewIncV, NewPredBB);
4279     }
4280   }
4281 }
4282 
4283 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
4284                                    unsigned UF, ElementCount VF,
4285                                    bool IsPtrLoopInvariant,
4286                                    SmallBitVector &IsIndexLoopInvariant,
4287                                    VPTransformState &State) {
4288   // Construct a vector GEP by widening the operands of the scalar GEP as
4289   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4290   // results in a vector of pointers when at least one operand of the GEP
4291   // is vector-typed. Thus, to keep the representation compact, we only use
4292   // vector-typed operands for loop-varying values.
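  // Illustrative sketch (assumed names): for a GEP with a loop-invariant base
  // and a loop-varying index, e.g.
  //   %gep = getelementptr inbounds i32, i32* %a, i64 %i
  // only the varying index is widened, e.g.
  //   %vgep = getelementptr inbounds i32, i32* %a, <4 x i64> %vec.ind
  // which produces a <4 x i32*> vector of pointers.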
4293 
4294   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4295     // If we are vectorizing, but the GEP has only loop-invariant operands,
4296     // the GEP we build (by only using vector-typed operands for
4297     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4298     // produce a vector of pointers, we need to either arbitrarily pick an
4299     // operand to broadcast, or broadcast a clone of the original GEP.
4300     // Here, we broadcast a clone of the original.
4301     //
4302     // TODO: If at some point we decide to scalarize instructions having
4303     //       loop-invariant operands, this special case will no longer be
4304     //       required. We would add the scalarization decision to
4305     //       collectLoopScalars() and teach getVectorValue() to broadcast
4306     //       the lane-zero scalar value.
4307     auto *Clone = Builder.Insert(GEP->clone());
4308     for (unsigned Part = 0; Part < UF; ++Part) {
4309       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4310       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4311       addMetadata(EntryPart, GEP);
4312     }
4313   } else {
4314     // If the GEP has at least one loop-varying operand, we are sure to
4315     // produce a vector of pointers. But if we are only unrolling, we want
4316     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4317     // produce with the code below will be scalar (if VF == 1) or vector
4318     // (otherwise). Note that for the unroll-only case, we still maintain
4319     // values in the vector mapping with initVector, as we do for other
4320     // instructions.
4321     for (unsigned Part = 0; Part < UF; ++Part) {
4322       // The pointer operand of the new GEP. If it's loop-invariant, we
4323       // won't broadcast it.
4324       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4325                                      : State.get(Operands.getOperand(0), Part);
4326 
4327       // Collect all the indices for the new GEP. If any index is
4328       // loop-invariant, we won't broadcast it.
4329       SmallVector<Value *, 4> Indices;
4330       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4331         VPValue *Operand = Operands.getOperand(I);
4332         if (IsIndexLoopInvariant[I - 1])
4333           Indices.push_back(State.get(Operand, {0, 0}));
4334         else
4335           Indices.push_back(State.get(Operand, Part));
4336       }
4337 
4338       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4339       // but it should be a vector, otherwise.
4340       auto *NewGEP =
4341           GEP->isInBounds()
4342               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4343                                           Indices)
4344               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4345       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4346              "NewGEP is not a pointer vector");
4347       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4348       addMetadata(NewGEP, GEP);
4349     }
4350   }
4351 }
4352 
4353 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4354                                               ElementCount VF) {
4355   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4356   PHINode *P = cast<PHINode>(PN);
4357   if (EnableVPlanNativePath) {
4358     // Currently we enter here in the VPlan-native path for non-induction
4359     // PHIs where all control flow is uniform. We simply widen these PHIs.
4360     // Create a vector phi with no operands - the vector phi operands will be
4361     // set at the end of vector code generation.
4362     Type *VecTy =
4363         (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4364     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4365     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4366     OrigPHIsToFix.push_back(P);
4367 
4368     return;
4369   }
4370 
4371   assert(PN->getParent() == OrigLoop->getHeader() &&
4372          "Non-header phis should have been handled elsewhere");
4373 
4374   // In order to support recurrences we need to be able to vectorize Phi nodes.
4375   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4376   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4377   // this value when we vectorize all of the instructions that use the PHI.
4378   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4379     for (unsigned Part = 0; Part < UF; ++Part) {
4380       // This is phase one of vectorizing PHIs.
4381       bool ScalarPHI =
4382           (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4383       Type *VecTy =
4384           ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4385       Value *EntryPart = PHINode::Create(
4386           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4387       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4388     }
4389     return;
4390   }
4391 
4392   setDebugLocFromInst(Builder, P);
4393 
4394   // This PHINode must be an induction variable.
4395   // Make sure that we know about it.
4396   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4397 
4398   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4399   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4400 
4401   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4402   // which can be found from the original scalar operations.
4403   switch (II.getKind()) {
4404   case InductionDescriptor::IK_NoInduction:
4405     llvm_unreachable("Unknown induction");
4406   case InductionDescriptor::IK_IntInduction:
4407   case InductionDescriptor::IK_FpInduction:
4408     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4409   case InductionDescriptor::IK_PtrInduction: {
4410     // Handle the pointer induction variable case.
4411     assert(P->getType()->isPointerTy() && "Unexpected type.");
4412 
4413     if (Cost->isScalarAfterVectorization(P, VF)) {
4414       // This is the normalized GEP that starts counting at zero.
4415       Value *PtrInd =
4416           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4417       // Determine the number of scalars we need to generate for each unroll
4418       // iteration. If the instruction is uniform, we only need to generate the
4419       // first lane. Otherwise, we generate all VF values.
4420       unsigned Lanes =
4421           Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4422       for (unsigned Part = 0; Part < UF; ++Part) {
4423         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4424           Constant *Idx = ConstantInt::get(PtrInd->getType(),
4425                                            Lane + Part * VF.getKnownMinValue());
4426           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4427           Value *SclrGep =
4428               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4429           SclrGep->setName("next.gep");
4430           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4431         }
4432       }
4433       return;
4434     }
4435     assert(isa<SCEVConstant>(II.getStep()) &&
4436            "Induction step not a SCEV constant!");
4437     Type *PhiType = II.getStep()->getType();
4438 
4439     // Build a pointer phi
4440     Value *ScalarStartValue = II.getStartValue();
4441     Type *ScStValueType = ScalarStartValue->getType();
4442     PHINode *NewPointerPhi =
4443         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4444     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4445 
4446     // A pointer induction, performed by using a gep
4447     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4448     Instruction *InductionLoc = LoopLatch->getTerminator();
4449     const SCEV *ScalarStep = II.getStep();
4450     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4451     Value *ScalarStepValue =
4452         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4453     Value *InductionGEP = GetElementPtrInst::Create(
4454         ScStValueType->getPointerElementType(), NewPointerPhi,
4455         Builder.CreateMul(
4456             ScalarStepValue,
4457             ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4458         "ptr.ind", InductionLoc);
4459     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4460 
4461     // Create UF many actual address geps that use the pointer
4462     // phi as base and a vectorized version of the step value
4463     // (<step*0, ..., step*N>) as offset.
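    // For example, with VF = 4, UF = 2 and a unit step, Part 0 uses the
    // offsets <0, 1, 2, 3> and Part 1 uses <4, 5, 6, 7>, each multiplied by
    // the splatted step value before being fed to the GEP below.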
4464     for (unsigned Part = 0; Part < UF; ++Part) {
4465       SmallVector<Constant *, 8> Indices;
4466       // Create a vector of consecutive numbers from zero to VF.
4467       for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4468         Indices.push_back(
4469             ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4470       Constant *StartOffset = ConstantVector::get(Indices);
4471 
4472       Value *GEP = Builder.CreateGEP(
4473           ScStValueType->getPointerElementType(), NewPointerPhi,
4474           Builder.CreateMul(
4475               StartOffset,
4476               Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4477               "vector.gep"));
4478       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4479     }
4480   }
4481   }
4482 }
4483 
4484 /// A helper function for checking whether an integer division-related
4485 /// instruction may divide by zero (in which case it must be predicated if
4486 /// executed conditionally in the scalar code).
4487 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4488 /// Non-zero divisors that are non compile-time constants will not be
4489 /// converted into multiplication, so we will still end up scalarizing
4490 /// the division, but can do so w/o predication.
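/// For example, 'udiv i32 %x, 7' can never divide by zero and needs no
/// predication, whereas 'udiv i32 %x, %y' must be predicated when executed
/// conditionally in the scalar code, since %y may be zero.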
4491 static bool mayDivideByZero(Instruction &I) {
4492   assert((I.getOpcode() == Instruction::UDiv ||
4493           I.getOpcode() == Instruction::SDiv ||
4494           I.getOpcode() == Instruction::URem ||
4495           I.getOpcode() == Instruction::SRem) &&
4496          "Unexpected instruction");
4497   Value *Divisor = I.getOperand(1);
4498   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4499   return !CInt || CInt->isZero();
4500 }
4501 
4502 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
4503                                            VPTransformState &State) {
4504   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4505   switch (I.getOpcode()) {
4506   case Instruction::Call:
4507   case Instruction::Br:
4508   case Instruction::PHI:
4509   case Instruction::GetElementPtr:
4510   case Instruction::Select:
4511     llvm_unreachable("This instruction is handled by a different recipe.");
4512   case Instruction::UDiv:
4513   case Instruction::SDiv:
4514   case Instruction::SRem:
4515   case Instruction::URem:
4516   case Instruction::Add:
4517   case Instruction::FAdd:
4518   case Instruction::Sub:
4519   case Instruction::FSub:
4520   case Instruction::FNeg:
4521   case Instruction::Mul:
4522   case Instruction::FMul:
4523   case Instruction::FDiv:
4524   case Instruction::FRem:
4525   case Instruction::Shl:
4526   case Instruction::LShr:
4527   case Instruction::AShr:
4528   case Instruction::And:
4529   case Instruction::Or:
4530   case Instruction::Xor: {
4531     // Just widen unops and binops.
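    // For example (sketch, assumed names), a scalar
    //   %mul = fmul fast double %a, %b
    // is widened per unroll part into
    //   %mul = fmul fast <4 x double> %a.vec, %b.vec
    // with the IR flags of the original instruction copied over below.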
4532     setDebugLocFromInst(Builder, &I);
4533 
4534     for (unsigned Part = 0; Part < UF; ++Part) {
4535       SmallVector<Value *, 2> Ops;
4536       for (VPValue *VPOp : User.operands())
4537         Ops.push_back(State.get(VPOp, Part));
4538 
4539       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4540 
4541       if (auto *VecOp = dyn_cast<Instruction>(V))
4542         VecOp->copyIRFlags(&I);
4543 
4544       // Use this vector value for all users of the original instruction.
4545       VectorLoopValueMap.setVectorValue(&I, Part, V);
4546       addMetadata(V, &I);
4547     }
4548 
4549     break;
4550   }
4551   case Instruction::ICmp:
4552   case Instruction::FCmp: {
4553     // Widen compares. Generate vector compares.
4554     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4555     auto *Cmp = cast<CmpInst>(&I);
4556     setDebugLocFromInst(Builder, Cmp);
4557     for (unsigned Part = 0; Part < UF; ++Part) {
4558       Value *A = State.get(User.getOperand(0), Part);
4559       Value *B = State.get(User.getOperand(1), Part);
4560       Value *C = nullptr;
4561       if (FCmp) {
4562         // Propagate fast math flags.
4563         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4564         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4565         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4566       } else {
4567         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4568       }
4569       VectorLoopValueMap.setVectorValue(&I, Part, C);
4570       addMetadata(C, &I);
4571     }
4572 
4573     break;
4574   }
4575 
4576   case Instruction::ZExt:
4577   case Instruction::SExt:
4578   case Instruction::FPToUI:
4579   case Instruction::FPToSI:
4580   case Instruction::FPExt:
4581   case Instruction::PtrToInt:
4582   case Instruction::IntToPtr:
4583   case Instruction::SIToFP:
4584   case Instruction::UIToFP:
4585   case Instruction::Trunc:
4586   case Instruction::FPTrunc:
4587   case Instruction::BitCast: {
4588     auto *CI = cast<CastInst>(&I);
4589     setDebugLocFromInst(Builder, CI);
4590 
    // Vectorize casts.
4592     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4593     Type *DestTy =
4594         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4595 
4596     for (unsigned Part = 0; Part < UF; ++Part) {
4597       Value *A = State.get(User.getOperand(0), Part);
4598       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4599       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4600       addMetadata(Cast, &I);
4601     }
4602     break;
4603   }
4604   default:
4605     // This instruction is not vectorized by simple widening.
4606     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4607     llvm_unreachable("Unhandled instruction!");
4608   } // end of switch.
4609 }
4610 
4611 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
4612                                                VPTransformState &State) {
4613   assert(!isa<DbgInfoIntrinsic>(I) &&
4614          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4615   setDebugLocFromInst(Builder, &I);
4616 
4617   Module *M = I.getParent()->getParent()->getParent();
4618   auto *CI = cast<CallInst>(&I);
4619 
4620   SmallVector<Type *, 4> Tys;
4621   for (Value *ArgOperand : CI->arg_operands())
4622     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4623 
4624   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4625 
  // The flag indicates whether we use an intrinsic or an ordinary call for the
  // vectorized version of the instruction, i.e. whether the intrinsic call is
  // more beneficial than a library call.
4629   bool NeedToScalarize = false;
4630   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4631   bool UseVectorIntrinsic =
4632       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4633   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4634          "Instruction should be scalarized elsewhere.");
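  // For example (illustrative), a call to @llvm.sqrt.f64 may be widened to
  // the @llvm.sqrt.v4f64 intrinsic if the intrinsic cost wins, or to a vector
  // math-library routine found through VFDatabase otherwise.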
4635 
4636   for (unsigned Part = 0; Part < UF; ++Part) {
4637     SmallVector<Value *, 4> Args;
4638     for (auto &I : enumerate(ArgOperands.operands())) {
4639       // Some intrinsics have a scalar argument - don't replace it with a
4640       // vector.
4641       Value *Arg;
4642       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4643         Arg = State.get(I.value(), Part);
4644       else
4645         Arg = State.get(I.value(), {0, 0});
4646       Args.push_back(Arg);
4647     }
4648 
4649     Function *VectorF;
4650     if (UseVectorIntrinsic) {
4651       // Use vector version of the intrinsic.
4652       Type *TysForDecl[] = {CI->getType()};
4653       if (VF.isVector()) {
4654         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4655         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4656       }
4657       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4658       assert(VectorF && "Can't retrieve vector intrinsic.");
4659     } else {
4660       // Use vector version of the function call.
4661       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4662 #ifndef NDEBUG
4663       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4664              "Can't create vector function.");
4665 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    VectorLoopValueMap.setVectorValue(&I, Part, V);
    addMetadata(V, &I);
4677   }
4678 }
4679 
4680 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
4681                                                  VPUser &Operands,
4682                                                  bool InvariantCond,
4683                                                  VPTransformState &State) {
4684   setDebugLocFromInst(Builder, &I);
4685 
  // The condition can be loop invariant but still defined inside the
4687   // loop. This means that we can't just use the original 'cond' value.
4688   // We have to take the 'vectorized' value and pick the first lane.
4689   // Instcombine will make this a no-op.
4690   auto *InvarCond =
4691       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4692 
4693   for (unsigned Part = 0; Part < UF; ++Part) {
4694     Value *Cond =
4695         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4696     Value *Op0 = State.get(Operands.getOperand(1), Part);
4697     Value *Op1 = State.get(Operands.getOperand(2), Part);
4698     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4699     VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4700     addMetadata(Sel, &I);
4701   }
4702 }
4703 
4704 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4705   // We should not collect Scalars more than once per VF. Right now, this
4706   // function is called from collectUniformsAndScalars(), which already does
4707   // this check. Collecting Scalars for VF=1 does not make any sense.
4708   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4709          "This function should not be visited twice for the same VF");
4710 
4711   SmallSetVector<Instruction *, 8> Worklist;
4712 
4713   // These sets are used to seed the analysis with pointers used by memory
4714   // accesses that will remain scalar.
4715   SmallSetVector<Instruction *, 8> ScalarPtrs;
4716   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4717   auto *Latch = TheLoop->getLoopLatch();
4718 
4719   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4720   // The pointer operands of loads and stores will be scalar as long as the
4721   // memory access is not a gather or scatter operation. The value operand of a
4722   // store will remain scalar if the store is scalarized.
4723   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4724     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4725     assert(WideningDecision != CM_Unknown &&
4726            "Widening decision should be ready at this moment");
4727     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4728       if (Ptr == Store->getValueOperand())
4729         return WideningDecision == CM_Scalarize;
4730     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
4732     return WideningDecision != CM_GatherScatter;
4733   };
4734 
4735   // A helper that returns true if the given value is a bitcast or
4736   // getelementptr instruction contained in the loop.
4737   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4738     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4739             isa<GetElementPtrInst>(V)) &&
4740            !TheLoop->isLoopInvariant(V);
4741   };
4742 
4743   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4744     if (!isa<PHINode>(Ptr) ||
4745         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4746       return false;
4747     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4748     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4749       return false;
4750     return isScalarUse(MemAccess, Ptr);
4751   };
4752 
  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is the loop's pointer induction variable, it and its update are
  // inserted into Worklist. If the use will be a scalar use, and the
  // pointer is only used by memory accesses, we place the pointer in
  // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
4758   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4759     if (isScalarPtrInduction(MemAccess, Ptr)) {
4760       Worklist.insert(cast<Instruction>(Ptr));
4761       Instruction *Update = cast<Instruction>(
4762           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4763       Worklist.insert(Update);
4764       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4765                         << "\n");
4766       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4767                         << "\n");
4768       return;
4769     }
4770     // We only care about bitcast and getelementptr instructions contained in
4771     // the loop.
4772     if (!isLoopVaryingBitCastOrGEP(Ptr))
4773       return;
4774 
4775     // If the pointer has already been identified as scalar (e.g., if it was
4776     // also identified as uniform), there's nothing to do.
4777     auto *I = cast<Instruction>(Ptr);
4778     if (Worklist.count(I))
4779       return;
4780 
4781     // If the use of the pointer will be a scalar use, and all users of the
4782     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4783     // place the pointer in PossibleNonScalarPtrs.
4784     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4785           return isa<LoadInst>(U) || isa<StoreInst>(U);
4786         }))
4787       ScalarPtrs.insert(I);
4788     else
4789       PossibleNonScalarPtrs.insert(I);
4790   };
4791 
4792   // We seed the scalars analysis with two classes of instructions: (1)
4793   // instructions marked uniform-after-vectorization and (2) bitcast,
4794   // getelementptr and (pointer) phi instructions used by memory accesses
4795   // requiring a scalar use.
4796   //
4797   // (1) Add to the worklist all instructions that have been identified as
4798   // uniform-after-vectorization.
4799   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4800 
4801   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4802   // memory accesses requiring a scalar use. The pointer operands of loads and
4803   // stores will be scalar as long as the memory access is not a gather or
4804   // scatter operation. The value operand of a store will remain scalar if the
4805   // store is scalarized.
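  // For illustration only (hypothetical IR, not taken from a test): a GEP like
  //   %gep = getelementptr inbounds i32, i32* %a, i64 %iv
  // whose only users are widened, consecutive loads/stores has a scalar use of
  // its pointer operand and is placed in ScalarPtrs, whereas the same GEP
  // feeding a gather/scatter (CM_GatherScatter) would not be.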
4806   for (auto *BB : TheLoop->blocks())
4807     for (auto &I : *BB) {
4808       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4809         evaluatePtrUse(Load, Load->getPointerOperand());
4810       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4811         evaluatePtrUse(Store, Store->getPointerOperand());
4812         evaluatePtrUse(Store, Store->getValueOperand());
4813       }
4814     }
4815   for (auto *I : ScalarPtrs)
4816     if (!PossibleNonScalarPtrs.count(I)) {
4817       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4818       Worklist.insert(I);
4819     }
4820 
4821   // Insert the forced scalars.
4822   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4823   // induction variable when the PHI user is scalarized.
4824   auto ForcedScalar = ForcedScalars.find(VF);
4825   if (ForcedScalar != ForcedScalars.end())
4826     for (auto *I : ForcedScalar->second)
4827       Worklist.insert(I);
4828 
4829   // Expand the worklist by looking through any bitcasts and getelementptr
4830   // instructions we've already identified as scalar. This is similar to the
4831   // expansion step in collectLoopUniforms(); however, here we're only
4832   // expanding to include additional bitcasts and getelementptr instructions.
4833   unsigned Idx = 0;
4834   while (Idx != Worklist.size()) {
4835     Instruction *Dst = Worklist[Idx++];
4836     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4837       continue;
4838     auto *Src = cast<Instruction>(Dst->getOperand(0));
4839     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4840           auto *J = cast<Instruction>(U);
4841           return !TheLoop->contains(J) || Worklist.count(J) ||
4842                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4843                   isScalarUse(J, Src));
4844         })) {
4845       Worklist.insert(Src);
4846       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4847     }
4848   }
4849 
4850   // An induction variable will remain scalar if all users of the induction
4851   // variable and induction variable update remain scalar.
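  // Illustrative sketch (hypothetical loop, not from a test): in
  //   for (i = 0; i < n; ++i) a[i] = b[i];
  // where both accesses are widened consecutively, the remaining in-loop users
  // of i and its update are the (already scalar) address computations and the
  // latch compare, so both typically remain scalar after vectorization.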
4852   for (auto &Induction : Legal->getInductionVars()) {
4853     auto *Ind = Induction.first;
4854     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4855 
4856     // If tail-folding is applied, the primary induction variable will be used
4857     // to feed a vector compare.
4858     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4859       continue;
4860 
4861     // Determine if all users of the induction variable are scalar after
4862     // vectorization.
4863     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4864       auto *I = cast<Instruction>(U);
4865       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4866     });
4867     if (!ScalarInd)
4868       continue;
4869 
4870     // Determine if all users of the induction variable update instruction are
4871     // scalar after vectorization.
4872     auto ScalarIndUpdate =
4873         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4874           auto *I = cast<Instruction>(U);
4875           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4876         });
4877     if (!ScalarIndUpdate)
4878       continue;
4879 
4880     // The induction variable and its update instruction will remain scalar.
4881     Worklist.insert(Ind);
4882     Worklist.insert(IndUpdate);
4883     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4884     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4885                       << "\n");
4886   }
4887 
4888   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4889 }
4890 
4891 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
4892                                                          ElementCount VF) {
4893   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4894   if (!blockNeedsPredication(I->getParent()))
4895     return false;
4896   switch(I->getOpcode()) {
4897   default:
4898     break;
4899   case Instruction::Load:
4900   case Instruction::Store: {
4901     if (!Legal->isMaskRequired(I))
4902       return false;
4903     auto *Ptr = getLoadStorePointerOperand(I);
4904     auto *Ty = getMemInstValueType(I);
4905     // We have already decided how to vectorize this instruction, get that
4906     // result.
4907     if (VF.isVector()) {
4908       InstWidening WideningDecision = getWideningDecision(I, VF);
4909       assert(WideningDecision != CM_Unknown &&
4910              "Widening decision should be ready at this moment");
4911       return WideningDecision == CM_Scalarize;
4912     }
4913     const Align Alignment = getLoadStoreAlignment(I);
4914     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4915                                 isLegalMaskedGather(Ty, Alignment))
4916                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4917                                 isLegalMaskedScatter(Ty, Alignment));
4918   }
4919   case Instruction::UDiv:
4920   case Instruction::SDiv:
4921   case Instruction::SRem:
4922   case Instruction::URem:
4923     return mayDivideByZero(*I);
4924   }
4925   return false;
4926 }
4927 
4928 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4929     Instruction *I, ElementCount VF) {
4930   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4931   assert(getWideningDecision(I, VF) == CM_Unknown &&
4932          "Decision should not be set yet.");
4933   auto *Group = getInterleavedAccessGroup(I);
4934   assert(Group && "Must have a group.");
4935 
4936   // If the instruction's allocated size doesn't equal its type size, it
4937   // requires padding and will be scalarized.
4938   auto &DL = I->getModule()->getDataLayout();
4939   auto *ScalarTy = getMemInstValueType(I);
4940   if (hasIrregularType(ScalarTy, DL, VF))
4941     return false;
4942 
4943   // Check if masking is required.
4944   // A Group may need masking for one of two reasons: it resides in a block that
4945   // needs predication, or it was decided to use masking to deal with gaps.
4946   bool PredicatedAccessRequiresMasking =
4947       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4948   bool AccessWithGapsRequiresMasking =
4949       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4950   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4951     return true;
4952 
4953   // If masked interleaving is required, we expect that the user/target had
4954   // enabled it, because otherwise it either wouldn't have been created or
4955   // it should have been invalidated by the CostModel.
4956   assert(useMaskedInterleavedAccesses(TTI) &&
4957          "Masked interleave-groups for predicated accesses are not enabled.");
4958 
4959   auto *Ty = getMemInstValueType(I);
4960   const Align Alignment = getLoadStoreAlignment(I);
4961   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4962                           : TTI.isLegalMaskedStore(Ty, Alignment);
4963 }
4964 
4965 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4966     Instruction *I, ElementCount VF) {
4967   // Get and ensure we have a valid memory instruction.
4968   LoadInst *LI = dyn_cast<LoadInst>(I);
4969   StoreInst *SI = dyn_cast<StoreInst>(I);
4970   assert((LI || SI) && "Invalid memory instruction");
4971 
4972   auto *Ptr = getLoadStorePointerOperand(I);
4973 
4974   // First of all, in order to be widened, the pointer must be consecutive.
4975   if (!Legal->isConsecutivePtr(Ptr))
4976     return false;
4977 
4978   // If the instruction is a store located in a predicated block, it will be
4979   // scalarized.
4980   if (isScalarWithPredication(I))
4981     return false;
4982 
4983   // If the instruction's allocated size doesn't equal its type size, it
4984   // requires padding and will be scalarized.
4985   auto &DL = I->getModule()->getDataLayout();
4986   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4987   if (hasIrregularType(ScalarTy, DL, VF))
4988     return false;
4989 
4990   return true;
4991 }
4992 
4993 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4994   // We should not collect Uniforms more than once per VF. Right now,
4995   // this function is called from collectUniformsAndScalars(), which
4996   // already does this check. Collecting Uniforms for VF=1 does not make any
4997   // sense.
4998 
4999   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5000          "This function should not be visited twice for the same VF");
5001 
5002   // Visit the list of Uniforms. If we find no uniform value, we will not
5003   // analyze it again; Uniforms.count(VF) will still return 1.
5004   Uniforms[VF].clear();
5005 
5006   // We now know that the loop is vectorizable!
5007   // Collect instructions inside the loop that will remain uniform after
5008   // vectorization.
5009 
5010   // Global values, params and instructions outside of the current loop are
5011   // out of scope.
5012   auto isOutOfScope = [&](Value *V) -> bool {
5013     Instruction *I = dyn_cast<Instruction>(V);
5014     return (!I || !TheLoop->contains(I));
5015   };
5016 
5017   SetVector<Instruction *> Worklist;
5018   BasicBlock *Latch = TheLoop->getLoopLatch();
5019 
5020   // Instructions that are scalar with predication must not be considered
5021   // uniform after vectorization, because that would create an erroneous
5022   // replicating region where only a single instance out of VF should be formed.
5023   // TODO: optimize such seldom cases if found important, see PR40816.
5024   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5025     if (isScalarWithPredication(I, VF)) {
5026       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5027                         << *I << "\n");
5028       return;
5029     }
5030     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5031     Worklist.insert(I);
5032   };
5033 
5034   // Start with the conditional branch. If the branch condition is an
5035   // instruction contained in the loop that is only used by the branch, it is
5036   // uniform.
5037   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5038   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5039     addToWorklistIfAllowed(Cmp);
5040 
5041   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
5042   // are pointers that are treated like consecutive pointers during
5043   // vectorization. The pointer operands of interleaved accesses are an
5044   // example.
5045   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
5046 
5047   // Holds pointer operands of instructions that are possibly non-uniform.
5048   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
5049 
5050   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5051     InstWidening WideningDecision = getWideningDecision(I, VF);
5052     assert(WideningDecision != CM_Unknown &&
5053            "Widening decision should be ready at this moment");
5054 
5055     return (WideningDecision == CM_Widen ||
5056             WideningDecision == CM_Widen_Reverse ||
5057             WideningDecision == CM_Interleave);
5058   };
5059   // Iterate over the instructions in the loop, and collect all
5060   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
5061   // that a consecutive-like pointer operand will be scalarized, we collect it
5062   // in PossibleNonUniformPtrs instead. We use two sets here because a single
5063   // getelementptr instruction can be used by both vectorized and scalarized
5064   // memory instructions. For example, if a loop loads and stores from the same
5065   // location, but the store is conditional, the store will be scalarized, and
5066   // the getelementptr won't remain uniform.
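  // Illustrative sketch (hypothetical IR): given
  //   %gep = getelementptr inbounds i32, i32* %a, i64 %iv
  //   %v   = load i32, i32* %gep    ; widened consecutively
  //   store i32 %v, i32* %gep       ; conditional, will be scalarized
  // the scalarized store places %gep in PossibleNonUniformPtrs even though the
  // load alone would have placed it in ConsecutiveLikePtrs.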
5067   for (auto *BB : TheLoop->blocks())
5068     for (auto &I : *BB) {
5069       // If there's no pointer operand, there's nothing to do.
5070       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
5071       if (!Ptr)
5072         continue;
5073 
5074       // True if all users of Ptr are memory accesses that have Ptr as their
5075       // pointer operand.
5076       auto UsersAreMemAccesses =
5077           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
5078             return getLoadStorePointerOperand(U) == Ptr;
5079           });
5080 
5081       // Ensure the memory instruction will not be scalarized or used by
5082       // gather/scatter, making its pointer operand non-uniform. If the pointer
5083       // operand is used by any instruction other than a memory access, we
5084       // conservatively assume the pointer operand may be non-uniform.
5085       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
5086         PossibleNonUniformPtrs.insert(Ptr);
5087 
5088       // If the memory instruction will be vectorized and its pointer operand
5089       // is consecutive-like, or interleaving - the pointer operand should
5090       // remain uniform.
5091       else
5092         ConsecutiveLikePtrs.insert(Ptr);
5093     }
5094 
5095   // Add to the Worklist all consecutive and consecutive-like pointers that
5096   // aren't also identified as possibly non-uniform.
5097   for (auto *V : ConsecutiveLikePtrs)
5098     if (!PossibleNonUniformPtrs.count(V))
5099       addToWorklistIfAllowed(V);
5100 
5101   // Expand Worklist in topological order: whenever a new instruction
5102   // is added, its users should already be inside Worklist. This ensures
5103   // a uniform instruction will only be used by uniform instructions.
5104   unsigned idx = 0;
5105   while (idx != Worklist.size()) {
5106     Instruction *I = Worklist[idx++];
5107 
5108     for (auto OV : I->operand_values()) {
5109       // isOutOfScope operands cannot be uniform instructions.
5110       if (isOutOfScope(OV))
5111         continue;
5112       // First-order recurrence phis should typically be considered
5113       // non-uniform.
5114       auto *OP = dyn_cast<PHINode>(OV);
5115       if (OP && Legal->isFirstOrderRecurrence(OP))
5116         continue;
5117       // If all the users of the operand are uniform, then add the
5118       // operand into the uniform worklist.
5119       auto *OI = cast<Instruction>(OV);
5120       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5121             auto *J = cast<Instruction>(U);
5122             return Worklist.count(J) ||
5123                    (OI == getLoadStorePointerOperand(J) &&
5124                     isUniformDecision(J, VF));
5125           }))
5126         addToWorklistIfAllowed(OI);
5127     }
5128   }
5129 
5130   // Returns true if Ptr is the pointer operand of a memory access instruction
5131   // I, and I is known to not require scalarization.
5132   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5133     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5134   };
5135 
5136   // For an instruction to be added into Worklist above, all its users inside
5137   // the loop should also be in Worklist. However, this condition cannot be
5138   // true for phi nodes that form a cyclic dependence. We must process phi
5139   // nodes separately. An induction variable will remain uniform if all users
5140   // of the induction variable and induction variable update remain uniform.
5141   // The code below handles both pointer and non-pointer induction variables.
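  // Illustrative sketch: a pointer induction used only as the address of
  // consecutively widened accesses (see isVectorizedMemAccessUse) and by its
  // own update typically remains uniform, since the vector access can be
  // formed from the single first-lane value.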
5142   for (auto &Induction : Legal->getInductionVars()) {
5143     auto *Ind = Induction.first;
5144     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5145 
5146     // Determine if all users of the induction variable are uniform after
5147     // vectorization.
5148     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5149       auto *I = cast<Instruction>(U);
5150       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5151              isVectorizedMemAccessUse(I, Ind);
5152     });
5153     if (!UniformInd)
5154       continue;
5155 
5156     // Determine if all users of the induction variable update instruction are
5157     // uniform after vectorization.
5158     auto UniformIndUpdate =
5159         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5160           auto *I = cast<Instruction>(U);
5161           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5162                  isVectorizedMemAccessUse(I, IndUpdate);
5163         });
5164     if (!UniformIndUpdate)
5165       continue;
5166 
5167     // The induction variable and its update instruction will remain uniform.
5168     addToWorklistIfAllowed(Ind);
5169     addToWorklistIfAllowed(IndUpdate);
5170   }
5171 
5172   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5173 }
5174 
5175 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5176   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5177 
5178   if (Legal->getRuntimePointerChecking()->Need) {
5179     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5180         "runtime pointer checks needed. Enable vectorization of this "
5181         "loop with '#pragma clang loop vectorize(enable)' when "
5182         "compiling with -Os/-Oz",
5183         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5184     return true;
5185   }
5186 
5187   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5188     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5189         "runtime SCEV checks needed. Enable vectorization of this "
5190         "loop with '#pragma clang loop vectorize(enable)' when "
5191         "compiling with -Os/-Oz",
5192         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5193     return true;
5194   }
5195 
5196   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5197   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5198     reportVectorizationFailure("Runtime stride check for small trip count",
5199         "runtime stride == 1 checks needed. Enable vectorization of this "
5200         "loop with '#pragma clang loop vectorize(enable)' when compiling with -Os/-Oz",
5201         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5202     return true;
5203   }
5204 
5205   return false;
5206 }
5207 
5208 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
5209                                                             unsigned UserIC) {
5210   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5211     // TODO: It may be useful to do this, since the check is still likely to
5212     // be dynamically uniform if the target can skip it.
5213     reportVectorizationFailure(
5214         "Not inserting runtime ptr check for divergent target",
5215         "runtime pointer checks needed. Not enabled for divergent target",
5216         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5217     return None;
5218   }
5219 
5220   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5221   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5222   if (TC == 1) {
5223     reportVectorizationFailure("Single iteration (non) loop",
5224         "loop trip count is one, irrelevant for vectorization",
5225         "SingleIterationLoop", ORE, TheLoop);
5226     return None;
5227   }
5228 
5229   switch (ScalarEpilogueStatus) {
5230   case CM_ScalarEpilogueAllowed:
5231     return UserVF ? UserVF : computeFeasibleMaxVF(TC);
5232   case CM_ScalarEpilogueNotNeededUsePredicate:
5233     LLVM_DEBUG(
5234         dbgs() << "LV: vector predicate hint/switch found.\n"
5235                << "LV: Not allowing scalar epilogue, creating predicated "
5236                << "vector loop.\n");
5237     break;
5238   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5239     // fallthrough as a special case of OptForSize
5240   case CM_ScalarEpilogueNotAllowedOptSize:
5241     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5242       LLVM_DEBUG(
5243           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5244     else
5245       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5246                         << "count.\n");
5247 
5248     // Bail if runtime checks are required, which are not good when optimizing
5249     // for size.
5250     if (runtimeChecksRequired())
5251       return None;
5252     break;
5253   }
5254 
5255   // Now try to fold the tail by masking.
5256 
5257   // Invalidate interleave groups that require an epilogue if we can't mask
5258   // the interleave-group.
5259   if (!useMaskedInterleavedAccesses(TTI)) {
5260     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5261            "No decisions should have been taken at this point");
5262     // Note: There is no need to invalidate any cost modeling decisions here, as
5263     // none were taken so far.
5264     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5265   }
5266 
5267   unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
5268   assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
5269   unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
5270   if (TC > 0 && TC % MaxVFtimesIC == 0) {
5271     // Accept MaxVF if we do not have a tail.
5272     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5273     return MaxVF;
5274   }
5275 
5276   // If we don't know the precise trip count, or if the trip count that we
5277   // found modulo the vectorization factor is not zero, try to fold the tail
5278   // by masking.
5279   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5280   if (Legal->prepareToFoldTailByMasking()) {
5281     FoldTailByMasking = true;
5282     return MaxVF;
5283   }
5284 
5285   // If there was a tail-folding hint/switch, but we can't fold the tail by
5286   // masking, fallback to a vectorization with a scalar epilogue.
5287   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5288     if (PreferPredicateOverEpilogue == PreferPredicateTy::PredicateOrDontVectorize) {
5289       LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5290       return None;
5291     }
5292     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5293                          "scalar epilogue instead.\n");
5294     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5295     return MaxVF;
5296   }
5297 
5298   if (TC == 0) {
5299     reportVectorizationFailure(
5300         "Unable to calculate the loop count due to complex control flow",
5301         "unable to calculate the loop count due to complex control flow",
5302         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5303     return None;
5304   }
5305 
5306   reportVectorizationFailure(
5307       "Cannot optimize for size and vectorize at the same time.",
5308       "cannot optimize for size and vectorize at the same time. "
5309       "Enable vectorization of this loop with '#pragma clang loop "
5310       "vectorize(enable)' when compiling with -Os/-Oz",
5311       "NoTailLoopWithOptForSize", ORE, TheLoop);
5312   return None;
5313 }
5314 
5315 unsigned
5316 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5317   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5318   unsigned SmallestType, WidestType;
5319   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5320   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5321 
5322   // Get the maximum safe dependence distance in bits computed by LAA.
5323   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5324   // the memory access that is most restrictive (involved in the smallest
5325   // dependence distance).
5326   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5327 
5328   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5329 
5330   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5331   // Note that both WidestRegister and WidestType may not be powers of 2.
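  // Purely illustrative numbers: with a 256-bit widest register and a widest
  // element type of 64 bits, MaxVectorSize = PowerOf2Floor(256 / 64) = 4.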
5332   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
5333 
5334   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5335                     << " / " << WidestType << " bits.\n");
5336   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5337                     << WidestRegister << " bits.\n");
5338 
5339   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5340                                  " into one vector!");
5341   if (MaxVectorSize == 0) {
5342     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5343     MaxVectorSize = 1;
5344     return MaxVectorSize;
5345   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5346              isPowerOf2_32(ConstTripCount)) {
5347     // We need to clamp the VF to be the ConstTripCount. There is no point in
5348     // choosing a higher viable VF as done in the loop below.
5349     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5350                       << ConstTripCount << "\n");
5351     MaxVectorSize = ConstTripCount;
5352     return MaxVectorSize;
5353   }
5354 
5355   unsigned MaxVF = MaxVectorSize;
5356   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5357       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5358     // Collect all viable vectorization factors larger than the default MaxVF
5359     // (i.e. MaxVectorSize).
5360     SmallVector<ElementCount, 8> VFs;
5361     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5362     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5363       VFs.push_back(ElementCount::getFixed(VS));
5364 
5365     // For each VF calculate its register usage.
5366     auto RUs = calculateRegisterUsage(VFs);
5367 
5368     // Select the largest VF which doesn't require more registers than existing
5369     // ones.
5370     for (int i = RUs.size() - 1; i >= 0; --i) {
5371       bool Selected = true;
5372       for (auto& pair : RUs[i].MaxLocalUsers) {
5373         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5374         if (pair.second > TargetNumRegisters)
5375           Selected = false;
5376       }
5377       if (Selected) {
5378         MaxVF = VFs[i].getKnownMinValue();
5379         break;
5380       }
5381     }
5382     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5383       if (MaxVF < MinVF) {
5384         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5385                           << ") with target's minimum: " << MinVF << '\n');
5386         MaxVF = MinVF;
5387       }
5388     }
5389   }
5390   return MaxVF;
5391 }
5392 
5393 VectorizationFactor
5394 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5395   float Cost = expectedCost(ElementCount::getFixed(1)).first;
5396   const float ScalarCost = Cost;
5397   unsigned Width = 1;
5398   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5399 
5400   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5401   if (ForceVectorization && MaxVF > 1) {
5402     // Ignore scalar width, because the user explicitly wants vectorization.
5403     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5404     // evaluation.
5405     Cost = std::numeric_limits<float>::max();
5406   }
5407 
5408   for (unsigned i = 2; i <= MaxVF; i *= 2) {
5409     // Notice that the vector loop needs to be executed fewer times, so
5410     // we need to divide the cost of the vector loop by the width of
5411     // the vector elements.
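    // Purely illustrative numbers: if the scalar loop costs 8 per iteration
    // and the VF=4 vector body has expected cost 20, the comparison below uses
    // 20 / 4 = 5 against 8, so VF=4 would be preferred so far.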
5412     VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5413     float VectorCost = C.first / (float)i;
5414     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5415                       << " costs: " << (int)VectorCost << ".\n");
5416     if (!C.second && !ForceVectorization) {
5417       LLVM_DEBUG(
5418           dbgs() << "LV: Not considering vector loop of width " << i
5419                  << " because it will not generate any vector instructions.\n");
5420       continue;
5421     }
5422     if (VectorCost < Cost) {
5423       Cost = VectorCost;
5424       Width = i;
5425     }
5426   }
5427 
5428   if (!EnableCondStoresVectorization && NumPredStores) {
5429     reportVectorizationFailure("There are conditional stores.",
5430         "store that is conditionally executed prevents vectorization",
5431         "ConditionalStore", ORE, TheLoop);
5432     Width = 1;
5433     Cost = ScalarCost;
5434   }
5435 
5436   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5437              << "LV: Vectorization seems to be not beneficial, "
5438              << "but was forced by a user.\n");
5439   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5440   VectorizationFactor Factor = {ElementCount::getFixed(Width),
5441                                 (unsigned)(Width * Cost)};
5442   return Factor;
5443 }
5444 
5445 std::pair<unsigned, unsigned>
5446 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5447   unsigned MinWidth = -1U;
5448   unsigned MaxWidth = 8;
5449   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5450 
5451   // For each block.
5452   for (BasicBlock *BB : TheLoop->blocks()) {
5453     // For each instruction in the loop.
5454     for (Instruction &I : BB->instructionsWithoutDebug()) {
5455       Type *T = I.getType();
5456 
5457       // Skip ignored values.
5458       if (ValuesToIgnore.count(&I))
5459         continue;
5460 
5461       // Only examine Loads, Stores and PHINodes.
5462       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5463         continue;
5464 
5465       // Examine PHI nodes that are reduction variables. Update the type to
5466       // account for the recurrence type.
5467       if (auto *PN = dyn_cast<PHINode>(&I)) {
5468         if (!Legal->isReductionVariable(PN))
5469           continue;
5470         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5471         T = RdxDesc.getRecurrenceType();
5472       }
5473 
5474       // Examine the stored values.
5475       if (auto *ST = dyn_cast<StoreInst>(&I))
5476         T = ST->getValueOperand()->getType();
5477 
5478       // Ignore loaded pointer types and stored pointer types that are not
5479       // vectorizable.
5480       //
5481       // FIXME: The check here attempts to predict whether a load or store will
5482       //        be vectorized. We only know this for certain after a VF has
5483       //        been selected. Here, we assume that if an access can be
5484       //        vectorized, it will be. We should also look at extending this
5485       //        optimization to non-pointer types.
5486       //
5487       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5488           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5489         continue;
5490 
5491       MinWidth = std::min(MinWidth,
5492                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5493       MaxWidth = std::max(MaxWidth,
5494                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5495     }
5496   }
5497 
5498   return {MinWidth, MaxWidth};
5499 }
5500 
5501 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5502                                                            unsigned LoopCost) {
5503   // -- The interleave heuristics --
5504   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5505   // There are many micro-architectural considerations that we can't predict
5506   // at this level. For example, frontend pressure (on decode or fetch) due to
5507   // code size, or the number and capabilities of the execution ports.
5508   //
5509   // We use the following heuristics to select the interleave count:
5510   // 1. If the code has reductions, then we interleave to break the cross
5511   // iteration dependency.
5512   // 2. If the loop is really small, then we interleave to reduce the loop
5513   // overhead.
5514   // 3. We don't interleave if we think that we will spill registers to memory
5515   // due to the increased register pressure.
5516 
5517   if (!isScalarEpilogueAllowed())
5518     return 1;
5519 
5520   // The dependence distance already limited the VF; do not interleave further.
5521   if (Legal->getMaxSafeDepDistBytes() != -1U)
5522     return 1;
5523 
5524   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5525   const bool HasReductions = !Legal->getReductionVars().empty();
5526   // Do not interleave loops with a relatively small known or estimated trip
5527   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5528   // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5529   // because with the above conditions interleaving can expose ILP and break
5530   // cross-iteration dependences for reductions.
5531   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5532       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5533     return 1;
5534 
5535   RegisterUsage R = calculateRegisterUsage({VF})[0];
5536   // We divide by these counts below, so make sure each is at least one by
5537   // assuming at least one instruction uses at least one register.
5538   for (auto& pair : R.MaxLocalUsers) {
5539     pair.second = std::max(pair.second, 1U);
5540   }
5541 
5542   // We calculate the interleave count using the following formula.
5543   // Subtract the number of loop invariants from the number of available
5544   // registers. These registers are used by all of the interleaved instances.
5545   // Next, divide the remaining registers by the number of registers that is
5546   // required by the loop, in order to estimate how many parallel instances
5547   // fit without causing spills. All of this is rounded down if necessary to be
5548   // a power of two. We want power of two interleave count to simplify any
5549   // addressing operations or alignment considerations.
5550   // We also want power of two interleave counts to ensure that the induction
5551   // variable of the vector loop wraps to zero, when tail is folded by masking;
5552   // this currently happens when optimizing for size (IC is set to 1 above).
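  // Purely illustrative numbers: with 16 available registers in a class, 2
  // loop-invariant values and a max local usage of 3, the basic formula below
  // yields PowerOf2Floor((16 - 2) / 3) = 4.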
5553   unsigned IC = UINT_MAX;
5554 
5555   for (auto& pair : R.MaxLocalUsers) {
5556     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5557     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5558                       << " registers of "
5559                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5560     if (VF.isScalar()) {
5561       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5562         TargetNumRegisters = ForceTargetNumScalarRegs;
5563     } else {
5564       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5565         TargetNumRegisters = ForceTargetNumVectorRegs;
5566     }
5567     unsigned MaxLocalUsers = pair.second;
5568     unsigned LoopInvariantRegs = 0;
5569     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5570       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5571 
5572     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5573     // Don't count the induction variable as interleaved.
5574     if (EnableIndVarRegisterHeur) {
5575       TmpIC =
5576           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5577                         std::max(1U, (MaxLocalUsers - 1)));
5578     }
5579 
5580     IC = std::min(IC, TmpIC);
5581   }
5582 
5583   // Clamp the interleave ranges to reasonable counts.
5584   assert(!VF.isScalable() && "scalable vectors not yet supported.");
5585   unsigned MaxInterleaveCount =
5586       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5587 
5588   // Check if the user has overridden the max.
5589   if (VF.isScalar()) {
5590     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5591       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5592   } else {
5593     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5594       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5595   }
5596 
5597   // If the trip count is a known or estimated compile-time constant, limit
5598   // the interleave count to at most the trip count divided by VF.
5599   if (BestKnownTC) {
5600     MaxInterleaveCount =
5601         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5602   }
5603 
5604   // If we did not calculate the cost for VF (because the user selected the VF)
5605   // then we calculate the cost of VF here.
5606   if (LoopCost == 0)
5607     LoopCost = expectedCost(VF).first;
5608 
5609   assert(LoopCost && "Non-zero loop cost expected");
5610 
5611   // Clamp the calculated IC to be between 1 and the max interleave count
5612   // that the target and trip count allow.
5613   if (IC > MaxInterleaveCount)
5614     IC = MaxInterleaveCount;
5615   else if (IC < 1)
5616     IC = 1;
5617 
5618   // Interleave if we vectorized this loop and there is a reduction that could
5619   // benefit from interleaving.
5620   if (VF.isVector() && HasReductions) {
5621     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5622     return IC;
5623   }
5624 
5625   // Note that if we've already vectorized the loop we will have done the
5626   // runtime check and so interleaving won't require further checks.
5627   bool InterleavingRequiresRuntimePointerCheck =
5628       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5629 
5630   // We want to interleave small loops in order to reduce the loop overhead and
5631   // potentially expose ILP opportunities.
5632   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5633                     << "LV: IC is " << IC << '\n'
5634                     << "LV: VF is " << VF.getKnownMinValue() << '\n');
5635   const bool AggressivelyInterleaveReductions =
5636       TTI.enableAggressiveInterleaving(HasReductions);
5637   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5638     // We assume that the cost overhead is 1 and we use the cost model
5639     // to estimate the cost of the loop and interleave until the cost of the
5640     // loop overhead is about 5% of the cost of the loop.
5641     unsigned SmallIC =
5642         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5643 
5644     // Interleave until store/load ports (estimated by max interleave count) are
5645     // saturated.
5646     unsigned NumStores = Legal->getNumStores();
5647     unsigned NumLoads = Legal->getNumLoads();
5648     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5649     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5650 
5651     // If we have a scalar reduction (vector reductions are already dealt with
5652     // by this point), we can increase the critical path length if the loop
5653     // we're interleaving is inside another loop. Limit, by default to 2, so the
5654     // critical path only gets increased by one reduction operation.
5655     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5656       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5657       SmallIC = std::min(SmallIC, F);
5658       StoresIC = std::min(StoresIC, F);
5659       LoadsIC = std::min(LoadsIC, F);
5660     }
5661 
5662     if (EnableLoadStoreRuntimeInterleave &&
5663         std::max(StoresIC, LoadsIC) > SmallIC) {
5664       LLVM_DEBUG(
5665           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5666       return std::max(StoresIC, LoadsIC);
5667     }
5668 
5669     // If there are scalar reductions and TTI has enabled aggressive
5670     // interleaving for reductions, we will interleave to expose ILP.
5671     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5672         AggressivelyInterleaveReductions) {
5673       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5674       // Interleave no less than SmallIC but not as aggressive as the normal IC
5675       // to satisfy the rare situation when resources are too limited.
5676       return std::max(IC / 2, SmallIC);
5677     } else {
5678       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5679       return SmallIC;
5680     }
5681   }
5682 
5683   // Interleave if this is a large loop (small loops are already dealt with by
5684   // this point) that could benefit from interleaving.
5685   if (AggressivelyInterleaveReductions) {
5686     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5687     return IC;
5688   }
5689 
5690   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5691   return 1;
5692 }
5693 
5694 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5695 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5696   // This function calculates the register usage by measuring the highest number
5697   // of values that are alive at a single location. Obviously, this is a very
5698   // rough estimation. We scan the loop in topological order and
5699   // assign a number to each instruction. We use RPO to ensure that defs are
5700   // met before their users. We assume that each instruction that has in-loop
5701   // users starts an interval. We record every time that an in-loop value is
5702   // used, so we have a list of the first and last occurrences of each
5703   // instruction. Next, we transpose this data structure into a multi map that
5704   // holds the list of intervals that *end* at a specific location. This multi
5705   // map allows us to perform a linear search. We scan the instructions linearly
5706   // and record each time that a new interval starts, by placing it in a set.
5707   // If we find this value in the multi-map then we remove it from the set.
5708   // The max register usage is the maximum size of the set.
5709   // We also search for instructions that are defined outside the loop, but are
5710   // used inside the loop. We need this number separately from the max-interval
5711   // usage number because when we unroll, loop-invariant values do not take
5712   // more registers.
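  // Illustrative sketch of the interval idea (hypothetical straight-line IR):
  //   %a = ...             ; interval of %a opens
  //   %b = add %a, 1
  //   %c = add %b, %a      ; last use of %a and %b
  //   %d = add %c, 2
  // Just before %c both %a and %b are open, while at %d only %c remains, so at
  // most two values are simultaneously live in this snippet.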
5713   LoopBlocksDFS DFS(TheLoop);
5714   DFS.perform(LI);
5715 
5716   RegisterUsage RU;
5717 
5718   // Each 'key' in the map opens a new interval. The values
5719   // of the map are the index of the 'last seen' usage of the
5720   // instruction that is the key.
5721   using IntervalMap = DenseMap<Instruction *, unsigned>;
5722 
5723   // Maps instruction to its index.
5724   SmallVector<Instruction *, 64> IdxToInstr;
5725   // Marks the end of each interval.
5726   IntervalMap EndPoint;
5727   // Saves the list of instructions that are used in the loop.
5728   SmallPtrSet<Instruction *, 8> Ends;
5729   // Saves the list of values that are used in the loop but are
5730   // defined outside the loop, such as arguments and constants.
5731   SmallPtrSet<Value *, 8> LoopInvariants;
5732 
5733   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5734     for (Instruction &I : BB->instructionsWithoutDebug()) {
5735       IdxToInstr.push_back(&I);
5736 
5737       // Save the end location of each USE.
5738       for (Value *U : I.operands()) {
5739         auto *Instr = dyn_cast<Instruction>(U);
5740 
5741         // Ignore non-instruction values such as arguments, constants, etc.
5742         if (!Instr)
5743           continue;
5744 
5745         // If this instruction is outside the loop then record it and continue.
5746         if (!TheLoop->contains(Instr)) {
5747           LoopInvariants.insert(Instr);
5748           continue;
5749         }
5750 
5751         // Overwrite previous end points.
5752         EndPoint[Instr] = IdxToInstr.size();
5753         Ends.insert(Instr);
5754       }
5755     }
5756   }
5757 
5758   // Saves the list of intervals that end with the index in 'key'.
5759   using InstrList = SmallVector<Instruction *, 2>;
5760   DenseMap<unsigned, InstrList> TransposeEnds;
5761 
5762   // Transpose the EndPoints to a list of values that end at each index.
5763   for (auto &Interval : EndPoint)
5764     TransposeEnds[Interval.second].push_back(Interval.first);
5765 
5766   SmallPtrSet<Instruction *, 8> OpenIntervals;
5767 
5768   // Get the size of the widest register.
5769   unsigned MaxSafeDepDist = -1U;
5770   if (Legal->getMaxSafeDepDistBytes() != -1U)
5771     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5772   unsigned WidestRegister =
5773       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5774   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5775 
5776   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5777   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5778 
5779   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5780 
5781   // A lambda that gets the register usage for the given type and VF.
5782   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, ElementCount VF) {
5783     if (Ty->isTokenTy())
5784       return 0U;
5785     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5786     assert(!VF.isScalable() && "scalable vectors not yet supported.");
5787     return std::max<unsigned>(1, VF.getKnownMinValue() * TypeSize /
5788                                      WidestRegister);
5789   };
5790 
5791   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5792     Instruction *I = IdxToInstr[i];
5793 
5794     // Remove all of the instructions that end at this location.
5795     InstrList &List = TransposeEnds[i];
5796     for (Instruction *ToRemove : List)
5797       OpenIntervals.erase(ToRemove);
5798 
5799     // Ignore instructions that are never used within the loop.
5800     if (!Ends.count(I))
5801       continue;
5802 
5803     // Skip ignored values.
5804     if (ValuesToIgnore.count(I))
5805       continue;
5806 
5807     // For each VF find the maximum usage of registers.
5808     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5809       // Count the number of live intervals.
5810       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5811 
5812       if (VFs[j].isScalar()) {
5813         for (auto Inst : OpenIntervals) {
5814           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5815           if (RegUsage.find(ClassID) == RegUsage.end())
5816             RegUsage[ClassID] = 1;
5817           else
5818             RegUsage[ClassID] += 1;
5819         }
5820       } else {
5821         collectUniformsAndScalars(VFs[j]);
5822         for (auto Inst : OpenIntervals) {
5823           // Skip ignored values for VF > 1.
5824           if (VecValuesToIgnore.count(Inst))
5825             continue;
5826           if (isScalarAfterVectorization(Inst, VFs[j])) {
5827             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5828             if (RegUsage.find(ClassID) == RegUsage.end())
5829               RegUsage[ClassID] = 1;
5830             else
5831               RegUsage[ClassID] += 1;
5832           } else {
5833             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5834             if (RegUsage.find(ClassID) == RegUsage.end())
5835               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5836             else
5837               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5838           }
5839         }
5840       }
5841 
5842       for (auto& pair : RegUsage) {
5843         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5844           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5845         else
5846           MaxUsages[j][pair.first] = pair.second;
5847       }
5848     }
5849 
5850     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5851                       << OpenIntervals.size() << '\n');
5852 
5853     // Add the current instruction to the list of open intervals.
5854     OpenIntervals.insert(I);
5855   }
5856 
5857   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5858     SmallMapVector<unsigned, unsigned, 4> Invariant;
5859 
5860     for (auto Inst : LoopInvariants) {
5861       unsigned Usage =
5862           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5863       unsigned ClassID =
5864           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
5865       if (Invariant.find(ClassID) == Invariant.end())
5866         Invariant[ClassID] = Usage;
5867       else
5868         Invariant[ClassID] += Usage;
5869     }
5870 
5871     LLVM_DEBUG({
5872       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5873       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5874              << " item\n";
5875       for (const auto &pair : MaxUsages[i]) {
5876         dbgs() << "LV(REG): RegisterClass: "
5877                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5878                << " registers\n";
5879       }
5880       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5881              << " item\n";
5882       for (const auto &pair : Invariant) {
5883         dbgs() << "LV(REG): RegisterClass: "
5884                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5885                << " registers\n";
5886       }
5887     });
5888 
5889     RU.LoopInvariantRegs = Invariant;
5890     RU.MaxLocalUsers = MaxUsages[i];
5891     RUs[i] = RU;
5892   }
5893 
5894   return RUs;
5895 }
5896 
5897 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5898   // TODO: Cost model for emulated masked load/store is completely
5899   // broken. This hack guides the cost model to use an artificially
5900   // high enough value to practically disable vectorization with such
5901   // operations, except where previously deployed legality hack allowed
5902   // using very low cost values. This is to avoid regressions coming simply
5903   // from moving the "masked load/store" check from legality to the cost model.
5904   // Masked load/gather emulation was previously never allowed, while only a
5905   // limited amount of masked store/scatter emulation was allowed.
5906   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5907   return isa<LoadInst>(I) ||
5908          (isa<StoreInst>(I) &&
5909           NumPredStores > NumberOfStoresToPredicate);
5910 }
5911 
5912 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5913   // If we aren't vectorizing the loop, or if we've already collected the
5914   // instructions to scalarize, there's nothing to do. Collection may already
5915   // have occurred if we have a user-selected VF and are now computing the
5916   // expected cost for interleaving.
5917   if (VF.isScalar() || VF.isZero() ||
5918       InstsToScalarize.find(VF) != InstsToScalarize.end())
5919     return;
5920 
5921   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5922   // not profitable to scalarize any instructions, the presence of VF in the
5923   // map will indicate that we've analyzed it already.
5924   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5925 
5926   // Find all the instructions that are scalar with predication in the loop and
5927   // determine if it would be better to not if-convert the blocks they are in.
5928   // If so, we also record the instructions to scalarize.
5929   for (BasicBlock *BB : TheLoop->blocks()) {
5930     if (!blockNeedsPredication(BB))
5931       continue;
5932     for (Instruction &I : *BB)
5933       if (isScalarWithPredication(&I)) {
5934         ScalarCostsTy ScalarCosts;
5935         // Do not apply discount logic if hacked cost is needed
5936         // for emulated masked memrefs.
5937         if (!useEmulatedMaskMemRefHack(&I) &&
5938             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5939           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5940         // Remember that BB will remain after vectorization.
5941         PredicatedBBsAfterVectorization.insert(BB);
5942       }
5943   }
5944 }
5945 
5946 int LoopVectorizationCostModel::computePredInstDiscount(
5947     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5948     ElementCount VF) {
5949   assert(!isUniformAfterVectorization(PredInst, VF) &&
5950          "Instruction marked uniform-after-vectorization will be predicated");
5951 
5952   // Initialize the discount to zero, meaning that the scalar version and the
5953   // vector version cost the same.
5954   int Discount = 0;
5955 
5956   // Holds instructions to analyze. The instructions we visit are mapped in
5957   // ScalarCosts. Those instructions are the ones that would be scalarized if
5958   // we find that the scalar version costs less.
5959   SmallVector<Instruction *, 8> Worklist;
5960 
5961   // Returns true if the given instruction can be scalarized.
5962   auto canBeScalarized = [&](Instruction *I) -> bool {
5963     // We only attempt to scalarize instructions forming a single-use chain
5964     // from the original predicated block that would otherwise be vectorized.
5965     // Although not strictly necessary, we give up on instructions we know will
5966     // already be scalar to avoid traversing chains that are unlikely to be
5967     // beneficial.
5968     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5969         isScalarAfterVectorization(I, VF))
5970       return false;
5971 
5972     // If the instruction is scalar with predication, it will be analyzed
5973     // separately. We ignore it within the context of PredInst.
5974     if (isScalarWithPredication(I))
5975       return false;
5976 
5977     // If any of the instruction's operands are uniform after vectorization,
5978     // the instruction cannot be scalarized. This prevents, for example, a
5979     // masked load from being scalarized.
5980     //
5981     // We assume we will only emit a value for lane zero of an instruction
5982     // marked uniform after vectorization, rather than VF identical values.
5983     // Thus, if we scalarize an instruction that uses a uniform, we would
5984     // create uses of values corresponding to the lanes we aren't emitting code
5985     // for. This behavior can be changed by allowing getScalarValue to clone
5986     // the lane zero values for uniforms rather than asserting.
5987     for (Use &U : I->operands())
5988       if (auto *J = dyn_cast<Instruction>(U.get()))
5989         if (isUniformAfterVectorization(J, VF))
5990           return false;
5991 
5992     // Otherwise, we can scalarize the instruction.
5993     return true;
5994   };
5995 
5996   // Compute the expected cost discount from scalarizing the entire expression
5997   // feeding the predicated instruction. We currently only consider expressions
5998   // that are single-use instruction chains.
5999   Worklist.push_back(PredInst);
6000   while (!Worklist.empty()) {
6001     Instruction *I = Worklist.pop_back_val();
6002 
6003     // If we've already analyzed the instruction, there's nothing to do.
6004     if (ScalarCosts.find(I) != ScalarCosts.end())
6005       continue;
6006 
6007     // Compute the cost of the vector instruction. Note that this cost already
6008     // includes the scalarization overhead of the predicated instruction.
6009     unsigned VectorCost = getInstructionCost(I, VF).first;
6010 
6011     // Compute the cost of the scalarized instruction. This cost is the cost of
6012     // the instruction as if it wasn't if-converted and instead remained in the
6013     // predicated block. We will scale this cost by block probability after
6014     // computing the scalarization overhead.
6015     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6016     unsigned ScalarCost =
6017         VF.getKnownMinValue() *
6018         getInstructionCost(I, ElementCount::getFixed(1)).first;
6019 
6020     // Compute the scalarization overhead of needed insertelement instructions
6021     // and phi nodes.
6022     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6023       ScalarCost += TTI.getScalarizationOverhead(
6024           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6025           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6026       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6027       ScalarCost +=
6028           VF.getKnownMinValue() *
6029           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6030     }
6031 
6032     // Compute the scalarization overhead of needed extractelement
6033     // instructions. For each of the instruction's operands, if the operand can
6034     // be scalarized, add it to the worklist; otherwise, account for the
6035     // overhead.
6036     for (Use &U : I->operands())
6037       if (auto *J = dyn_cast<Instruction>(U.get())) {
6038         assert(VectorType::isValidElementType(J->getType()) &&
6039                "Instruction has non-scalar type");
6040         if (canBeScalarized(J))
6041           Worklist.push_back(J);
6042         else if (needsExtract(J, VF)) {
6043           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6044           ScalarCost += TTI.getScalarizationOverhead(
6045               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6046               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6047         }
6048       }
6049 
6050     // Scale the total scalar cost by block probability.
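    // Note: getReciprocalPredBlockProb() currently returns a constant factor
    // of 2 (the model assumes a predicated block executes on roughly half of
    // the iterations), so this division halves the accumulated scalar cost.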
6051     ScalarCost /= getReciprocalPredBlockProb();
6052 
6053     // Compute the discount. A non-negative discount means the vector version
6054     // of the instruction costs more, and scalarizing would be beneficial.
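    // For example, if VectorCost is 8 and the probability-scaled ScalarCost
    // is 6, the discount grows by 2, i.e. scalarizing this instruction is
    // expected to save two cost units.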
6055     Discount += VectorCost - ScalarCost;
6056     ScalarCosts[I] = ScalarCost;
6057   }
6058 
6059   return Discount;
6060 }
6061 
6062 LoopVectorizationCostModel::VectorizationCostTy
6063 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6064   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6065   VectorizationCostTy Cost;
6066 
6067   // For each block.
6068   for (BasicBlock *BB : TheLoop->blocks()) {
6069     VectorizationCostTy BlockCost;
6070 
6071     // For each instruction in the old loop.
6072     for (Instruction &I : BB->instructionsWithoutDebug()) {
6073       // Skip ignored values.
6074       if (ValuesToIgnore.count(&I) ||
6075           (VF.isVector() && VecValuesToIgnore.count(&I)))
6076         continue;
6077 
6078       VectorizationCostTy C = getInstructionCost(&I, VF);
6079 
6080       // Check if we should override the cost.
6081       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6082         C.first = ForceTargetInstructionCost;
6083 
6084       BlockCost.first += C.first;
6085       BlockCost.second |= C.second;
6086       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6087                         << " for VF " << VF << " For instruction: " << I
6088                         << '\n');
6089     }
6090 
6091     // If we are vectorizing a predicated block, it will have been
6092     // if-converted. This means that the block's instructions (aside from
6093     // stores and instructions that may divide by zero) will now be
6094     // unconditionally executed. For the scalar case, we may not always execute
6095     // the predicated block. Thus, scale the block's cost by the probability of
6096     // executing it.
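    // For example, under the current model a predicated block whose scalar
    // cost is 10 contributes only 5 to the loop cost, reflecting a roughly
    // 50% execution probability.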
6097     if (VF.isScalar() && blockNeedsPredication(BB))
6098       BlockCost.first /= getReciprocalPredBlockProb();
6099 
6100     Cost.first += BlockCost.first;
6101     Cost.second |= BlockCost.second;
6102   }
6103 
6104   return Cost;
6105 }
6106 
6107 /// Gets the address access SCEV after verifying that the access pattern
6108 /// is loop invariant except for the induction variable dependence.
6109 ///
6110 /// This SCEV can be sent to the Target in order to estimate the address
6111 /// calculation cost.
6112 static const SCEV *getAddressAccessSCEV(
6113               Value *Ptr,
6114               LoopVectorizationLegality *Legal,
6115               PredicatedScalarEvolution &PSE,
6116               const Loop *TheLoop) {
6117 
6118   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6119   if (!Gep)
6120     return nullptr;
6121 
6122   // We are looking for a gep with all loop invariant indices except for one
6123   // which should be an induction variable.
6124   auto SE = PSE.getSE();
6125   unsigned NumOperands = Gep->getNumOperands();
6126   for (unsigned i = 1; i < NumOperands; ++i) {
6127     Value *Opd = Gep->getOperand(i);
6128     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6129         !Legal->isInductionVariable(Opd))
6130       return nullptr;
6131   }
6132 
6133   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6134   return PSE.getSCEV(Ptr);
6135 }
6136 
6137 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6138   return Legal->hasStride(I->getOperand(0)) ||
6139          Legal->hasStride(I->getOperand(1));
6140 }
6141 
6142 unsigned
6143 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6144                                                         ElementCount VF) {
6145   assert(VF.isVector() &&
6146          "Scalarization cost of instruction implies vectorization.");
6147   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6148   Type *ValTy = getMemInstValueType(I);
6149   auto SE = PSE.getSE();
6150 
6151   unsigned AS = getLoadStoreAddressSpace(I);
6152   Value *Ptr = getLoadStorePointerOperand(I);
6153   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6154 
6155   // Figure out whether the access is strided and get the stride value
6156   // if it's known at compile time.
6157   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6158 
6159   // Get the cost of the scalar memory instruction and address computation.
6160   unsigned Cost =
6161       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6162 
6163   // Don't pass *I here, since it is scalar but will actually be part of a
6164   // vectorized loop where its user is a vectorized instruction.
6165   const Align Alignment = getLoadStoreAlignment(I);
6166   Cost += VF.getKnownMinValue() *
6167           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6168                               AS, TTI::TCK_RecipThroughput);
6169 
6170   // Get the overhead of the extractelement and insertelement instructions
6171   // we might create due to scalarization.
6172   Cost += getScalarizationOverhead(I, VF);
6173 
6174   // If we have a predicated store, it may not be executed for each vector
6175   // lane. Scale the cost by the probability of executing the predicated
6176   // block.
6177   if (isPredicatedInst(I)) {
6178     Cost /= getReciprocalPredBlockProb();
6179 
6180     if (useEmulatedMaskMemRefHack(I))
6181       // Artificially set the cost high enough to practically disable
6182       // vectorization of loops with such operations.
6183       Cost = 3000000;
6184   }
6185 
6186   return Cost;
6187 }
6188 
6189 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6190                                                              ElementCount VF) {
6191   Type *ValTy = getMemInstValueType(I);
6192   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6193   Value *Ptr = getLoadStorePointerOperand(I);
6194   unsigned AS = getLoadStoreAddressSpace(I);
6195   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6196   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6197 
6198   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6199          "Stride should be 1 or -1 for consecutive memory access");
6200   const Align Alignment = getLoadStoreAlignment(I);
6201   unsigned Cost = 0;
6202   if (Legal->isMaskRequired(I))
6203     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6204                                       CostKind);
6205   else
6206     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6207                                 CostKind, I);
6208 
6209   bool Reverse = ConsecutiveStride < 0;
6210   if (Reverse)
6211     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6212   return Cost;
6213 }
6214 
6215 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6216                                                          ElementCount VF) {
6217   Type *ValTy = getMemInstValueType(I);
6218   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6219   const Align Alignment = getLoadStoreAlignment(I);
6220   unsigned AS = getLoadStoreAddressSpace(I);
6221   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6222   if (isa<LoadInst>(I)) {
6223     return TTI.getAddressComputationCost(ValTy) +
6224            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6225                                CostKind) +
6226            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6227   }
6228   StoreInst *SI = cast<StoreInst>(I);
6229 
6230   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6231   return TTI.getAddressComputationCost(ValTy) +
6232          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6233                              CostKind) +
6234          (isLoopInvariantStoreValue
6235               ? 0
6236               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6237                                        VF.getKnownMinValue() - 1));
6238 }
6239 
6240 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6241                                                           ElementCount VF) {
6242   Type *ValTy = getMemInstValueType(I);
6243   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6244   const Align Alignment = getLoadStoreAlignment(I);
6245   const Value *Ptr = getLoadStorePointerOperand(I);
6246 
6247   return TTI.getAddressComputationCost(VectorTy) +
6248          TTI.getGatherScatterOpCost(
6249              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6250              TargetTransformInfo::TCK_RecipThroughput, I);
6251 }
6252 
6253 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6254                                                             ElementCount VF) {
6255   Type *ValTy = getMemInstValueType(I);
6256   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6257   unsigned AS = getLoadStoreAddressSpace(I);
6258 
6259   auto Group = getInterleavedAccessGroup(I);
6260   assert(Group && "Failed to get an interleaved access group.");
6261 
6262   unsigned InterleaveFactor = Group->getFactor();
6263   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6264   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6265 
6266   // Holds the indices of existing members in an interleaved load group.
6267   // An interleaved store group doesn't need this as it doesn't allow gaps.
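  // For example, a factor-4 load group whose only members are at positions 0
  // and 2 yields Indices = {0, 2}, letting the target account for the gaps.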
6268   SmallVector<unsigned, 4> Indices;
6269   if (isa<LoadInst>(I)) {
6270     for (unsigned i = 0; i < InterleaveFactor; i++)
6271       if (Group->getMember(i))
6272         Indices.push_back(i);
6273   }
6274 
6275   // Calculate the cost of the whole interleaved group.
6276   bool UseMaskForGaps =
6277       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6278   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6279       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6280       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6281 
6282   if (Group->isReverse()) {
6283     // TODO: Add support for reversed masked interleaved access.
6284     assert(!Legal->isMaskRequired(I) &&
6285            "Reverse masked interleaved access not supported.");
6286     Cost += Group->getNumMembers() *
6287             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6288   }
6289   return Cost;
6290 }
6291 
6292 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6293                                                               ElementCount VF) {
6294   // Calculate scalar cost only. Vectorization cost should be ready at this
6295   // moment.
6296   if (VF.isScalar()) {
6297     Type *ValTy = getMemInstValueType(I);
6298     const Align Alignment = getLoadStoreAlignment(I);
6299     unsigned AS = getLoadStoreAddressSpace(I);
6300 
6301     return TTI.getAddressComputationCost(ValTy) +
6302            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6303                                TTI::TCK_RecipThroughput, I);
6304   }
6305   return getWideningCost(I, VF);
6306 }
6307 
6308 LoopVectorizationCostModel::VectorizationCostTy
6309 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6310                                                ElementCount VF) {
6311   assert(!VF.isScalable() &&
6312          "the cost model is not yet implemented for scalable vectorization");
6313   // If we know that this instruction will remain uniform, check the cost of
6314   // the scalar version.
6315   if (isUniformAfterVectorization(I, VF))
6316     VF = ElementCount::getFixed(1);
6317 
6318   if (VF.isVector() && isProfitableToScalarize(I, VF))
6319     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6320 
6321   // Forced scalars do not have any scalarization overhead.
6322   auto ForcedScalar = ForcedScalars.find(VF);
6323   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6324     auto InstSet = ForcedScalar->second;
6325     if (InstSet.count(I))
6326       return VectorizationCostTy(
6327           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6328            VF.getKnownMinValue()),
6329           false);
6330   }
6331 
6332   Type *VectorTy;
6333   unsigned C = getInstructionCost(I, VF, VectorTy);
6334 
6335   bool TypeNotScalarized =
6336       VF.isVector() && VectorTy->isVectorTy() &&
6337       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6338   return VectorizationCostTy(C, TypeNotScalarized);
6339 }
6340 
6341 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6342                                                               ElementCount VF) {
6343 
6344   assert(!VF.isScalable() &&
6345          "cannot compute scalarization overhead for scalable vectorization");
6346   if (VF.isScalar())
6347     return 0;
6348 
6349   unsigned Cost = 0;
6350   Type *RetTy = ToVectorTy(I->getType(), VF);
6351   if (!RetTy->isVoidTy() &&
6352       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6353     Cost += TTI.getScalarizationOverhead(
6354         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6355         true, false);
6356 
6357   // Some targets keep addresses scalar.
6358   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6359     return Cost;
6360 
6361   // Some targets support efficient element stores.
6362   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6363     return Cost;
6364 
6365   // Collect operands to consider.
6366   CallInst *CI = dyn_cast<CallInst>(I);
6367   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6368 
6369   // Skip operands that do not require extraction/scalarization and do not incur
6370   // any overhead.
6371   return Cost + TTI.getOperandsScalarizationOverhead(
6372                     filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6373 }
6374 
6375 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6376   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6377   if (VF.isScalar())
6378     return;
6379   NumPredStores = 0;
6380   for (BasicBlock *BB : TheLoop->blocks()) {
6381     // For each instruction in the old loop.
6382     for (Instruction &I : *BB) {
6383       Value *Ptr = getLoadStorePointerOperand(&I);
6384       if (!Ptr)
6385         continue;
6386 
6387       // TODO: We should generate better code and update the cost model for
6388       // predicated uniform stores. Today they are treated as any other
6389       // predicated store (see added test cases in
6390       // invariant-store-vectorization.ll).
6391       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6392         NumPredStores++;
6393 
6394       if (Legal->isUniform(Ptr) &&
6395           // Conditional loads and stores should be scalarized and predicated.
6396           // isScalarWithPredication cannot be used here since masked
6397           // gather/scatters are not considered scalar with predication.
6398           !Legal->blockNeedsPredication(I.getParent())) {
6399         // TODO: Avoid replicating loads and stores instead of
6400         // relying on instcombine to remove them.
6401         // Load: Scalar load + broadcast
6402         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6403         unsigned Cost = getUniformMemOpCost(&I, VF);
6404         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6405         continue;
6406       }
6407 
6408       // We assume that widening is the best solution when possible.
6409       if (memoryInstructionCanBeWidened(&I, VF)) {
6410         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6411         int ConsecutiveStride =
6412                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6413         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6414                "Expected consecutive stride.");
6415         InstWidening Decision =
6416             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6417         setWideningDecision(&I, VF, Decision, Cost);
6418         continue;
6419       }
6420 
6421       // Choose between Interleaving, Gather/Scatter or Scalarization.
6422       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6423       unsigned NumAccesses = 1;
6424       if (isAccessInterleaved(&I)) {
6425         auto Group = getInterleavedAccessGroup(&I);
6426         assert(Group && "Failed to get an interleaved access group.");
6427 
6428         // Make one decision for the whole group.
6429         if (getWideningDecision(&I, VF) != CM_Unknown)
6430           continue;
6431 
6432         NumAccesses = Group->getNumMembers();
6433         if (interleavedAccessCanBeWidened(&I, VF))
6434           InterleaveCost = getInterleaveGroupCost(&I, VF);
6435       }
6436 
6437       unsigned GatherScatterCost =
6438           isLegalGatherOrScatter(&I)
6439               ? getGatherScatterCost(&I, VF) * NumAccesses
6440               : std::numeric_limits<unsigned>::max();
6441 
6442       unsigned ScalarizationCost =
6443           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6444 
6445       // Choose the best solution for the current VF, write down this
6446       // decision, and use it during vectorization.
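      // Ties between interleaving and gather/scatter favor interleaving, but
      // interleaving must also be strictly cheaper than scalarization;
      // otherwise gather/scatter wins only when strictly cheaper than
      // scalarization, and scalarization is the fallback.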
6447       unsigned Cost;
6448       InstWidening Decision;
6449       if (InterleaveCost <= GatherScatterCost &&
6450           InterleaveCost < ScalarizationCost) {
6451         Decision = CM_Interleave;
6452         Cost = InterleaveCost;
6453       } else if (GatherScatterCost < ScalarizationCost) {
6454         Decision = CM_GatherScatter;
6455         Cost = GatherScatterCost;
6456       } else {
6457         Decision = CM_Scalarize;
6458         Cost = ScalarizationCost;
6459       }
6460       // If the instruction belongs to an interleave group, the whole group
6461       // receives the same decision. The cost is computed for the whole
6462       // group, but it will actually be assigned to a single instruction.
6463       if (auto Group = getInterleavedAccessGroup(&I))
6464         setWideningDecision(Group, VF, Decision, Cost);
6465       else
6466         setWideningDecision(&I, VF, Decision, Cost);
6467     }
6468   }
6469 
6470   // Make sure that any load of an address and any other address computation
6471   // remain scalar unless there is gather/scatter support. This avoids
6472   // inevitable extracts into address registers, and also has the benefit of
6473   // activating LSR more, since that pass can't optimize vectorized
6474   // addresses.
6475   if (TTI.prefersVectorizedAddressing())
6476     return;
6477 
6478   // Start with all scalar pointer uses.
6479   SmallPtrSet<Instruction *, 8> AddrDefs;
6480   for (BasicBlock *BB : TheLoop->blocks())
6481     for (Instruction &I : *BB) {
6482       Instruction *PtrDef =
6483         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6484       if (PtrDef && TheLoop->contains(PtrDef) &&
6485           getWideningDecision(&I, VF) != CM_GatherScatter)
6486         AddrDefs.insert(PtrDef);
6487     }
6488 
6489   // Add all instructions used to generate the addresses.
6490   SmallVector<Instruction *, 4> Worklist;
6491   for (auto *I : AddrDefs)
6492     Worklist.push_back(I);
6493   while (!Worklist.empty()) {
6494     Instruction *I = Worklist.pop_back_val();
6495     for (auto &Op : I->operands())
6496       if (auto *InstOp = dyn_cast<Instruction>(Op))
6497         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6498             AddrDefs.insert(InstOp).second)
6499           Worklist.push_back(InstOp);
6500   }
6501 
6502   for (auto *I : AddrDefs) {
6503     if (isa<LoadInst>(I)) {
6504       // Setting the desired widening decision should ideally be handled by
6505       // the cost functions, but since this involves finding out whether the
6506       // loaded register is involved in an address computation, the decision
6507       // is instead changed here once we know this is the case.
6508       InstWidening Decision = getWideningDecision(I, VF);
6509       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6510         // Scalarize a widened load of address.
6511         setWideningDecision(
6512             I, VF, CM_Scalarize,
6513             (VF.getKnownMinValue() *
6514              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6515       else if (auto Group = getInterleavedAccessGroup(I)) {
6516         // Scalarize an interleave group of address loads.
6517         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6518           if (Instruction *Member = Group->getMember(I))
6519             setWideningDecision(
6520                 Member, VF, CM_Scalarize,
6521                 (VF.getKnownMinValue() *
6522                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6523         }
6524       }
6525     } else
6526       // Make sure I gets scalarized and is given a cost estimate without
6527       // scalarization overhead.
6528       ForcedScalars[VF].insert(I);
6529   }
6530 }
6531 
6532 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6533                                                         ElementCount VF,
6534                                                         Type *&VectorTy) {
6535   Type *RetTy = I->getType();
6536   if (canTruncateToMinimalBitwidth(I, VF))
6537     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6538   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6539   auto SE = PSE.getSE();
6540   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6541 
6542   // TODO: We need to estimate the cost of intrinsic calls.
6543   switch (I->getOpcode()) {
6544   case Instruction::GetElementPtr:
6545     // We mark this instruction as zero-cost because the cost of GEPs in
6546     // vectorized code depends on whether the corresponding memory instruction
6547     // is scalarized or not. Therefore, we handle GEPs with the memory
6548     // instruction cost.
6549     return 0;
6550   case Instruction::Br: {
6551     // In cases of scalarized and predicated instructions, there will be VF
6552     // predicated blocks in the vectorized loop. Each branch around these
6553     // blocks also requires an extract of its vector compare i1 element.
6554     bool ScalarPredicatedBB = false;
6555     BranchInst *BI = cast<BranchInst>(I);
6556     if (VF.isVector() && BI->isConditional() &&
6557         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6558          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6559       ScalarPredicatedBB = true;
6560 
6561     if (ScalarPredicatedBB) {
6562       // Return cost for branches around scalarized and predicated blocks.
6563       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6564       auto *Vec_i1Ty =
6565           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6566       return (TTI.getScalarizationOverhead(
6567                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
6568                   false, true) +
6569               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
6570                VF.getKnownMinValue()));
6571     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6572       // The back-edge branch will remain, as will all scalar branches.
6573       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6574     else
6575       // This branch will be eliminated by if-conversion.
6576       return 0;
6577     // Note: We currently assume zero cost for an unconditional branch inside
6578     // a predicated block since it will become a fall-through, although we
6579     // may decide in the future to call TTI for all branches.
6580   }
6581   case Instruction::PHI: {
6582     auto *Phi = cast<PHINode>(I);
6583 
6584     // First-order recurrences are replaced by vector shuffles inside the loop.
6585     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6586     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
6587       return TTI.getShuffleCost(
6588           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
6589           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
6590 
6591     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6592     // converted into select instructions. We require N - 1 selects per phi
6593     // node, where N is the number of incoming values.
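    // For example, a phi with three incoming values is lowered to a chain of
    // two selects keyed on the incoming edges' masks.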
6594     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6595       return (Phi->getNumIncomingValues() - 1) *
6596              TTI.getCmpSelInstrCost(
6597                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6598                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6599                  CostKind);
6600 
6601     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6602   }
6603   case Instruction::UDiv:
6604   case Instruction::SDiv:
6605   case Instruction::URem:
6606   case Instruction::SRem:
6607     // If we have a predicated instruction, it may not be executed for each
6608     // vector lane. Get the scalarization cost and scale this amount by the
6609     // probability of executing the predicated block. If the instruction is not
6610     // predicated, we fall through to the next case.
6611     if (VF.isVector() && isScalarWithPredication(I)) {
6612       unsigned Cost = 0;
6613 
6614       // These instructions have a non-void type, so account for the phi nodes
6615       // that we will create. This cost is likely to be zero. The phi node
6616       // cost, if any, should be scaled by the block probability because it
6617       // models a copy at the end of each predicated block.
6618       Cost += VF.getKnownMinValue() *
6619               TTI.getCFInstrCost(Instruction::PHI, CostKind);
6620 
6621       // The cost of the non-predicated instruction.
6622       Cost += VF.getKnownMinValue() *
6623               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
6624 
6625       // The cost of insertelement and extractelement instructions needed for
6626       // scalarization.
6627       Cost += getScalarizationOverhead(I, VF);
6628 
6629       // Scale the cost by the probability of executing the predicated blocks.
6630       // This assumes the predicated block for each vector lane is equally
6631       // likely.
6632       return Cost / getReciprocalPredBlockProb();
6633     }
6634     LLVM_FALLTHROUGH;
6635   case Instruction::Add:
6636   case Instruction::FAdd:
6637   case Instruction::Sub:
6638   case Instruction::FSub:
6639   case Instruction::Mul:
6640   case Instruction::FMul:
6641   case Instruction::FDiv:
6642   case Instruction::FRem:
6643   case Instruction::Shl:
6644   case Instruction::LShr:
6645   case Instruction::AShr:
6646   case Instruction::And:
6647   case Instruction::Or:
6648   case Instruction::Xor: {
6649     // Since we will replace the stride by 1, the multiplication should go away.
6650     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6651       return 0;
6652     // Certain instructions can be cheaper to vectorize if they have a constant
6653     // second vector operand. One example of this is shifts on x86.
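    // For instance, a vector shift by a splat-constant amount can typically be
    // selected to a single immediate-shift instruction, whereas a shift by a
    // variable per-lane amount may be considerably more expensive on some
    // subtargets.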
6654     Value *Op2 = I->getOperand(1);
6655     TargetTransformInfo::OperandValueProperties Op2VP;
6656     TargetTransformInfo::OperandValueKind Op2VK =
6657         TTI.getOperandInfo(Op2, Op2VP);
6658     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6659       Op2VK = TargetTransformInfo::OK_UniformValue;
6660 
6661     SmallVector<const Value *, 4> Operands(I->operand_values());
6662     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6663     return N * TTI.getArithmeticInstrCost(
6664                    I->getOpcode(), VectorTy, CostKind,
6665                    TargetTransformInfo::OK_AnyValue,
6666                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6667   }
6668   case Instruction::FNeg: {
6669     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
6670     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6671     return N * TTI.getArithmeticInstrCost(
6672                    I->getOpcode(), VectorTy, CostKind,
6673                    TargetTransformInfo::OK_AnyValue,
6674                    TargetTransformInfo::OK_AnyValue,
6675                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6676                    I->getOperand(0), I);
6677   }
6678   case Instruction::Select: {
6679     SelectInst *SI = cast<SelectInst>(I);
6680     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6681     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6682     Type *CondTy = SI->getCondition()->getType();
6683     if (!ScalarCond) {
6684       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
6685       CondTy = VectorType::get(CondTy, VF);
6686     }
6687     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
6688                                   CostKind, I);
6689   }
6690   case Instruction::ICmp:
6691   case Instruction::FCmp: {
6692     Type *ValTy = I->getOperand(0)->getType();
6693     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6694     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6695       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6696     VectorTy = ToVectorTy(ValTy, VF);
6697     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
6698                                   I);
6699   }
6700   case Instruction::Store:
6701   case Instruction::Load: {
6702     ElementCount Width = VF;
6703     if (Width.isVector()) {
6704       InstWidening Decision = getWideningDecision(I, Width);
6705       assert(Decision != CM_Unknown &&
6706              "CM decision should be taken at this point");
6707       if (Decision == CM_Scalarize)
6708         Width = ElementCount::getFixed(1);
6709     }
6710     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6711     return getMemoryInstructionCost(I, VF);
6712   }
6713   case Instruction::ZExt:
6714   case Instruction::SExt:
6715   case Instruction::FPToUI:
6716   case Instruction::FPToSI:
6717   case Instruction::FPExt:
6718   case Instruction::PtrToInt:
6719   case Instruction::IntToPtr:
6720   case Instruction::SIToFP:
6721   case Instruction::UIToFP:
6722   case Instruction::Trunc:
6723   case Instruction::FPTrunc:
6724   case Instruction::BitCast: {
6725     // Computes the CastContextHint from a Load/Store instruction.
6726     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6727       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6728              "Expected a load or a store!");
6729 
6730       if (VF.isScalar() || !TheLoop->contains(I))
6731         return TTI::CastContextHint::Normal;
6732 
6733       switch (getWideningDecision(I, VF)) {
6734       case LoopVectorizationCostModel::CM_GatherScatter:
6735         return TTI::CastContextHint::GatherScatter;
6736       case LoopVectorizationCostModel::CM_Interleave:
6737         return TTI::CastContextHint::Interleave;
6738       case LoopVectorizationCostModel::CM_Scalarize:
6739       case LoopVectorizationCostModel::CM_Widen:
6740         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6741                                         : TTI::CastContextHint::Normal;
6742       case LoopVectorizationCostModel::CM_Widen_Reverse:
6743         return TTI::CastContextHint::Reversed;
6744       case LoopVectorizationCostModel::CM_Unknown:
6745         llvm_unreachable("Instr did not go through cost modelling?");
6746       }
6747 
6748       llvm_unreachable("Unhandled case!");
6749     };
6750 
6751     unsigned Opcode = I->getOpcode();
6752     TTI::CastContextHint CCH = TTI::CastContextHint::None;
6753     // For Trunc, the context is the only user, which must be a StoreInst.
6754     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6755       if (I->hasOneUse())
6756         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
6757           CCH = ComputeCCH(Store);
6758     }
6759     // For Z/Sext, the context is the operand, which must be a LoadInst.
6760     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6761              Opcode == Instruction::FPExt) {
6762       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
6763         CCH = ComputeCCH(Load);
6764     }
6765 
6766     // We optimize the truncation of induction variables having constant
6767     // integer steps. The cost of these truncations is the same as the scalar
6768     // operation.
6769     if (isOptimizableIVTruncate(I, VF)) {
6770       auto *Trunc = cast<TruncInst>(I);
6771       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6772                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
6773     }
6774 
6775     Type *SrcScalarTy = I->getOperand(0)->getType();
6776     Type *SrcVecTy =
6777         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6778     if (canTruncateToMinimalBitwidth(I, VF)) {
6779       // This cast is going to be shrunk. This may remove the cast or it might
6780       // turn it into a slightly different cast. For example, if MinBW == 16,
6781       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6782       //
6783       // Calculate the modified src and dest types.
6784       Type *MinVecTy = VectorTy;
6785       if (Opcode == Instruction::Trunc) {
6786         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6787         VectorTy =
6788             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6789       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
6790         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6791         VectorTy =
6792             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6793       }
6794     }
6795 
6796     assert(!VF.isScalable() && "VF is assumed to be non scalable");
6797     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
6798     return N *
6799            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
6800   }
6801   case Instruction::Call: {
6802     bool NeedToScalarize;
6803     CallInst *CI = cast<CallInst>(I);
6804     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6805     if (getVectorIntrinsicIDForCall(CI, TLI))
6806       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6807     return CallCost;
6808   }
6809   default:
6810     // The cost of executing VF copies of the scalar instruction. This opcode
6811     // is unknown. Assume that it is the same as 'mul'.
6812     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
6813                                        Instruction::Mul, VectorTy, CostKind) +
6814            getScalarizationOverhead(I, VF);
6815   } // end of switch.
6816 }
6817 
6818 char LoopVectorize::ID = 0;
6819 
6820 static const char lv_name[] = "Loop Vectorization";
6821 
6822 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6823 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6824 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6825 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6826 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6827 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6828 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6829 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6830 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6831 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6832 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6833 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6834 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6835 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6836 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
6837 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6838 
6839 namespace llvm {
6840 
6841 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6842 
6843 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6844                               bool VectorizeOnlyWhenForced) {
6845   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6846 }
6847 
6848 } // end namespace llvm
6849 
6850 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6851   // Check if the pointer operand of a load or store instruction is
6852   // consecutive.
6853   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6854     return Legal->isConsecutivePtr(Ptr);
6855   return false;
6856 }
6857 
6858 void LoopVectorizationCostModel::collectValuesToIgnore() {
6859   // Ignore ephemeral values.
6860   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6861 
6862   // Ignore type-promoting instructions we identified during reduction
6863   // detection.
6864   for (auto &Reduction : Legal->getReductionVars()) {
6865     RecurrenceDescriptor &RedDes = Reduction.second;
6866     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6867     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6868   }
6869   // Ignore type-casting instructions we identified during induction
6870   // detection.
6871   for (auto &Induction : Legal->getInductionVars()) {
6872     InductionDescriptor &IndDes = Induction.second;
6873     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6874     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6875   }
6876 }
6877 
6878 void LoopVectorizationCostModel::collectInLoopReductions() {
6879   // For the moment, without predicated reduction instructions, we do not
6880   // support inloop reductions whilst folding the tail, and hence in those cases
6881   // all reductions are currently out of the loop.
6882   if (foldTailByMasking())
6883     return;
6884 
6885   for (auto &Reduction : Legal->getReductionVars()) {
6886     PHINode *Phi = Reduction.first;
6887     RecurrenceDescriptor &RdxDesc = Reduction.second;
6888 
6889     // We don't collect reductions that are type promoted (yet).
6890     if (RdxDesc.getRecurrenceType() != Phi->getType())
6891       continue;
6892 
6893     // If the target would prefer this reduction to happen "in-loop", then we
6894     // want to record it as such.
6895     unsigned Opcode = RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind());
6896     if (!PreferInLoopReductions &&
6897         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
6898                                    TargetTransformInfo::ReductionFlags()))
6899       continue;
6900 
6901     // Check that we can correctly put the reductions into the loop, by
6902     // finding the chain of operations that leads from the phi to the loop
6903     // exit value.
6904     SmallVector<Instruction *, 4> ReductionOperations =
6905         RdxDesc.getReductionOpChain(Phi, TheLoop);
6906     bool InLoop = !ReductionOperations.empty();
6907     if (InLoop)
6908       InLoopReductionChains[Phi] = ReductionOperations;
6909     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6910                       << " reduction for phi: " << *Phi << "\n");
6911   }
6912 }
6913 
6914 // TODO: We could return a pair of values that specify the min VF and
6915 // max VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6916 // `buildVPlans(VF, VF)`. We cannot do it yet because VPlan currently
6917 // doesn't have a cost model that can choose which plan to execute if
6918 // more than one is generated.
6919 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6920                                  LoopVectorizationCostModel &CM) {
6921   unsigned WidestType;
6922   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6923   return WidestVectorRegBits / WidestType;
6924 }
6925 
6926 VectorizationFactor
6927 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
6928   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
6929   ElementCount VF = UserVF;
6930   // Outer loop handling: outer loops may require CFG and instruction-level
6931   // transformations before even evaluating whether vectorization is profitable.
6932   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6933   // the vectorization pipeline.
6934   if (!OrigLoop->isInnermost()) {
6935     // If the user doesn't provide a vectorization factor, determine a
6936     // reasonable one.
6937     if (UserVF.isZero()) {
6938       VF = ElementCount::getFixed(
6939           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
6940       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6941 
6942       // Make sure we have a VF > 1 for stress testing.
6943       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6944         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6945                           << "overriding computed VF.\n");
6946         VF = ElementCount::getFixed(4);
6947       }
6948     }
6949     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6950     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
6951            "VF needs to be a power of two");
6952     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6953                       << "VF " << VF << " to build VPlans.\n");
6954     buildVPlans(VF.getKnownMinValue(), VF.getKnownMinValue());
6955 
6956     // For VPlan build stress testing, we bail out after VPlan construction.
6957     if (VPlanBuildStressTest)
6958       return VectorizationFactor::Disabled();
6959 
6960     return {VF, 0 /*Cost*/};
6961   }
6962 
6963   LLVM_DEBUG(
6964       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6965                 "VPlan-native path.\n");
6966   return VectorizationFactor::Disabled();
6967 }
6968 
6969 Optional<VectorizationFactor>
6970 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6971   assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
6972   assert(OrigLoop->isInnermost() && "Inner loop expected.");
6973   Optional<unsigned> MaybeMaxVF =
6974       CM.computeMaxVF(UserVF.getKnownMinValue(), UserIC);
6975   if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
6976     return None;
6977 
6978   // Invalidate interleave groups if all blocks of loop will be predicated.
6979   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6980       !useMaskedInterleavedAccesses(*TTI)) {
6981     LLVM_DEBUG(
6982         dbgs()
6983         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6984            "which requires masked-interleaved support.\n");
6985     if (CM.InterleaveInfo.invalidateGroups())
6986       // Invalidating interleave groups also requires invalidating all decisions
6987       // based on them, which includes widening decisions and uniform and scalar
6988       // values.
6989       CM.invalidateCostModelingDecisions();
6990   }
6991 
6992   if (!UserVF.isZero()) {
6993     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6994     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
6995            "VF needs to be a power of two");
6996     // Collect the instructions (and their associated costs) that will be more
6997     // profitable to scalarize.
6998     CM.selectUserVectorizationFactor(UserVF);
6999     CM.collectInLoopReductions();
7000     buildVPlansWithVPRecipes(UserVF.getKnownMinValue(),
7001                              UserVF.getKnownMinValue());
7002     LLVM_DEBUG(printPlans(dbgs()));
7003     return {{UserVF, 0}};
7004   }
7005 
7006   unsigned MaxVF = MaybeMaxVF.getValue();
7007   assert(MaxVF != 0 && "MaxVF is zero.");
7008 
7009   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
7010     // Collect Uniform and Scalar instructions after vectorization with VF.
7011     CM.collectUniformsAndScalars(ElementCount::getFixed(VF));
7012 
7013     // Collect the instructions (and their associated costs) that will be more
7014     // profitable to scalarize.
7015     if (VF > 1)
7016       CM.collectInstsToScalarize(ElementCount::getFixed(VF));
7017   }
7018 
7019   CM.collectInLoopReductions();
7020 
7021   buildVPlansWithVPRecipes(1, MaxVF);
7022   LLVM_DEBUG(printPlans(dbgs()));
7023   if (MaxVF == 1)
7024     return VectorizationFactor::Disabled();
7025 
7026   // Select the optimal vectorization factor.
7027   return CM.selectVectorizationFactor(MaxVF);
7028 }
7029 
7030 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7031   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7032                     << '\n');
7033   BestVF = VF;
7034   BestUF = UF;
7035 
7036   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7037     return !Plan->hasVF(VF);
7038   });
7039   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7040 }
7041 
7042 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7043                                            DominatorTree *DT) {
7044   // Perform the actual loop transformation.
7045 
7046   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7047   VPCallbackILV CallbackILV(ILV);
7048 
7049   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7050 
7051   VPTransformState State{*BestVF, BestUF,      LI,
7052                          DT,      ILV.Builder, ILV.VectorLoopValueMap,
7053                          &ILV,    CallbackILV};
7054   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7055   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7056   State.CanonicalIV = ILV.Induction;
7057 
7058   //===------------------------------------------------===//
7059   //
7060   // Notice: any optimization or new instruction that goes
7061   // into the code below should also be implemented in
7062   // the cost model.
7063   //
7064   //===------------------------------------------------===//
7065 
7066   // 2. Copy and widen instructions from the old loop into the new loop.
7067   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7068   VPlans.front()->execute(&State);
7069 
7070   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7071   //    predication, updating analyses.
7072   ILV.fixVectorizedLoop();
7073 }
7074 
7075 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7076     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7077   BasicBlock *Latch = OrigLoop->getLoopLatch();
7078 
7079   // We create new control-flow for the vectorized loop, so the original
7080   // condition will be dead after vectorization if it's only used by the
7081   // branch.
7082   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
7083   if (Cmp && Cmp->hasOneUse())
7084     DeadInstructions.insert(Cmp);
7085 
7086   // We create new "steps" for induction variable updates to which the original
7087   // induction variables map. An original update instruction will be dead if
7088   // all its users except the induction variable are dead.
7089   for (auto &Induction : Legal->getInductionVars()) {
7090     PHINode *Ind = Induction.first;
7091     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7092     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7093           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7094         }))
7095       DeadInstructions.insert(IndUpdate);
7096 
7097     // We also record as "Dead" the type-casting instructions we had identified
7098     // during induction analysis. We don't need any handling for them in the
7099     // vectorized loop because we have proven that, under a proper runtime
7100     // test guarding the vectorized loop, the value of the phi, and the casted
7101     // value of the phi, are the same. The last instruction in this casting chain
7102     // will get its scalar/vector/widened def from the scalar/vector/widened def
7103     // of the respective phi node. Any other casts in the induction def-use chain
7104     // have no other uses outside the phi update chain, and will be ignored.
7105     InductionDescriptor &IndDes = Induction.second;
7106     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7107     DeadInstructions.insert(Casts.begin(), Casts.end());
7108   }
7109 }
7110 
7111 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7112 
7113 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7114 
7115 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7116                                         Instruction::BinaryOps BinOp) {
7117   // When unrolling and the VF is 1, we only need to add a simple scalar.
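  // For example, with StartIdx == 2 and an integer Step of 1, the result is
  // simply Val + 2, i.e. the scalar induction value for the third unrolled
  // part.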
7118   Type *Ty = Val->getType();
7119   assert(!Ty->isVectorTy() && "Val must be a scalar");
7120 
7121   if (Ty->isFloatingPointTy()) {
7122     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7123 
7124     // Floating point operations had to be 'fast' to enable the unrolling.
7125     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7126     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7127   }
7128   Constant *C = ConstantInt::get(Ty, StartIdx);
7129   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7130 }
7131 
7132 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7133   SmallVector<Metadata *, 4> MDs;
7134   // Reserve first location for self reference to the LoopID metadata node.
7135   MDs.push_back(nullptr);
7136   bool IsUnrollMetadata = false;
7137   MDNode *LoopID = L->getLoopID();
7138   if (LoopID) {
7139     // First find existing loop unrolling disable metadata.
7140     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7141       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7142       if (MD) {
7143         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7144         IsUnrollMetadata =
7145             S && S->getString().startswith("llvm.loop.unroll.disable");
7146       }
7147       MDs.push_back(LoopID->getOperand(i));
7148     }
7149   }
7150 
7151   if (!IsUnrollMetadata) {
7152     // Add runtime unroll disable metadata.
7153     LLVMContext &Context = L->getHeader()->getContext();
7154     SmallVector<Metadata *, 1> DisableOperands;
7155     DisableOperands.push_back(
7156         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7157     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7158     MDs.push_back(DisableNode);
7159     MDNode *NewLoopID = MDNode::get(Context, MDs);
7160     // Set operand 0 to refer to the loop id itself.
7161     NewLoopID->replaceOperandWith(0, NewLoopID);
7162     L->setLoopID(NewLoopID);
7163   }
7164 }
7165 
7166 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7167     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7168   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
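  // Evaluate the predicate at Range.Start and clamp Range.End to the first
  // power-of-two VF at which the predicate's value changes, so that every VF
  // remaining in the range shares the same decision. For example, with
  // Range = [1, 16) and a predicate that only holds for VF >= 4, the range is
  // clamped to [1, 4) and false is returned.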
7169   bool PredicateAtRangeStart = Predicate(ElementCount::getFixed(Range.Start));
7170 
7171   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
7172     if (Predicate(ElementCount::getFixed(TmpVF)) != PredicateAtRangeStart) {
7173       Range.End = TmpVF;
7174       break;
7175     }
7176 
7177   return PredicateAtRangeStart;
7178 }
7179 
7180 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7181 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7182 /// of VF's starting at a given VF and extending it as much as possible. Each
7183 /// vectorization decision can potentially shorten this sub-range during
7184 /// buildVPlan().
7185 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
7186   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7187     VFRange SubRange = {VF, MaxVF + 1};
7188     VPlans.push_back(buildVPlan(SubRange));
7189     VF = SubRange.End;
7190   }
7191 }
7192 
7193 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7194                                          VPlanPtr &Plan) {
7195   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7196 
7197   // Look for cached value.
7198   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7199   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7200   if (ECEntryIt != EdgeMaskCache.end())
7201     return ECEntryIt->second;
7202 
7203   VPValue *SrcMask = createBlockInMask(Src, Plan);
7204 
7205   // The terminator has to be a branch inst!
7206   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7207   assert(BI && "Unexpected terminator found");
7208 
7209   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7210     return EdgeMaskCache[Edge] = SrcMask;
7211 
7212   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
7213   assert(EdgeMask && "No Edge Mask found for condition");
7214 
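  // The edge mask is the source block's mask combined with the branch
  // condition, negated when Dst is reached on the false edge.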
7215   if (BI->getSuccessor(0) != Dst)
7216     EdgeMask = Builder.createNot(EdgeMask);
7217 
7218   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7219     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7220 
7221   return EdgeMaskCache[Edge] = EdgeMask;
7222 }
7223 
7224 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7225   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7226 
7227   // Look for cached value.
7228   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7229   if (BCEntryIt != BlockMaskCache.end())
7230     return BCEntryIt->second;
7231 
7232   // All-one mask is modelled as no-mask following the convention for masked
7233   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7234   VPValue *BlockMask = nullptr;
7235 
7236   if (OrigLoop->getHeader() == BB) {
7237     if (!CM.blockNeedsPredication(BB))
7238       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7239 
7240     // Introduce the early-exit compare IV <= BTC to form header block mask.
7241     // This is used instead of IV < TC because TC may wrap, unlike BTC.
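    // For example, if the trip count equals 2^N for an N-bit IV, TC wraps to
    // 0 while BTC is the representable 2^N - 1, so "IV <= BTC" still yields
    // the intended header mask.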
7242     // Start by constructing the desired canonical IV.
7243     VPValue *IV = nullptr;
7244     if (Legal->getPrimaryInduction())
7245       IV = Plan->getVPValue(Legal->getPrimaryInduction());
7246     else {
7247       auto IVRecipe = new VPWidenCanonicalIVRecipe();
7248       Builder.getInsertBlock()->appendRecipe(IVRecipe);
7249       IV = IVRecipe->getVPValue();
7250     }
7251     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7252     bool TailFolded = !CM.isScalarEpilogueAllowed();
7253 
7254     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // While ActiveLaneMask is a binary op that consumes the loop tripcount
      // as a second argument, we only pass the IV here and extract the
      // tripcount from the transform state where codegen of the VP
      // instructions happens.
7259       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
7260     } else {
7261       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
7262     }
7263     return BlockMaskCache[BB] = BlockMask;
7264   }
7265 
  // This is the block mask. We OR all incoming edge masks.
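  // For example, a block with two predecessors P1 and P2 ends up with
  // BlockMask = EdgeMask(P1, BB) | EdgeMask(P2, BB), unless one of the edge
  // masks is all-one, in which case the block mask is all-one as well.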
7267   for (auto *Predecessor : predecessors(BB)) {
7268     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7269     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7270       return BlockMaskCache[BB] = EdgeMask;
7271 
7272     if (!BlockMask) { // BlockMask has its initialized nullptr value.
7273       BlockMask = EdgeMask;
7274       continue;
7275     }
7276 
7277     BlockMask = Builder.createOr(BlockMask, EdgeMask);
7278   }
7279 
7280   return BlockMaskCache[BB] = BlockMask;
7281 }
7282 
7283 VPWidenMemoryInstructionRecipe *
7284 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7285                                   VPlanPtr &Plan) {
7286   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7287          "Must be called with either a load or store");
7288 
7289   auto willWiden = [&](ElementCount VF) -> bool {
7290     assert(!VF.isScalable() && "unexpected scalable ElementCount");
7291     if (VF.isScalar())
7292       return false;
7293     LoopVectorizationCostModel::InstWidening Decision =
7294         CM.getWideningDecision(I, VF);
7295     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7296            "CM decision should be taken at this point.");
7297     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7298       return true;
7299     if (CM.isScalarAfterVectorization(I, VF) ||
7300         CM.isProfitableToScalarize(I, VF))
7301       return false;
7302     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7303   };
7304 
7305   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7306     return nullptr;
7307 
7308   VPValue *Mask = nullptr;
7309   if (Legal->isMaskRequired(I))
7310     Mask = createBlockInMask(I->getParent(), Plan);
7311 
7312   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
7313   if (LoadInst *Load = dyn_cast<LoadInst>(I))
7314     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
7315 
7316   StoreInst *Store = cast<StoreInst>(I);
7317   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
7318   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
7319 }
7320 
7321 VPWidenIntOrFpInductionRecipe *
7322 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
7323   // Check if this is an integer or fp induction. If so, build the recipe that
7324   // produces its scalar and vector values.
7325   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
7326   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
7327       II.getKind() == InductionDescriptor::IK_FpInduction)
7328     return new VPWidenIntOrFpInductionRecipe(Phi);
7329 
7330   return nullptr;
7331 }
7332 
7333 VPWidenIntOrFpInductionRecipe *
7334 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
7335                                                 VFRange &Range) const {
7336   // Optimize the special case where the source is a constant integer
7337   // induction variable. Notice that we can only optimize the 'trunc' case
7338   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7339   // (c) other casts depend on pointer size.
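  // For example (illustrative), a "trunc i64 %iv to i32" of a primary integer
  // induction can be widened directly as a narrower induction, avoiding a wide
  // induction followed by a separate truncate.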
7340 
7341   // Determine whether \p K is a truncation based on an induction variable that
7342   // can be optimized.
7343   auto isOptimizableIVTruncate =
7344       [&](Instruction *K) -> std::function<bool(ElementCount)> {
7345     return [=](ElementCount VF) -> bool {
7346       return CM.isOptimizableIVTruncate(K, VF);
7347     };
7348   };
7349 
7350   if (LoopVectorizationPlanner::getDecisionAndClampRange(
7351           isOptimizableIVTruncate(I), Range))
7352     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
7353                                              I);
7354   return nullptr;
7355 }
7356 
7357 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
7358   // We know that all PHIs in non-header blocks are converted into selects, so
7359   // we don't have to worry about the insertion order and we can just use the
7360   // builder. At this point we generate the predication tree. There may be
7361   // duplications since this is a simple recursive scan, but future
7362   // optimizations will clean it up.
7363 
7364   SmallVector<VPValue *, 2> Operands;
7365   unsigned NumIncoming = Phi->getNumIncomingValues();
7366   for (unsigned In = 0; In < NumIncoming; In++) {
7367     VPValue *EdgeMask =
7368       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
7369     assert((EdgeMask || NumIncoming == 1) &&
7370            "Multiple predecessors with one having a full mask");
7371     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
7372     if (EdgeMask)
7373       Operands.push_back(EdgeMask);
7374   }
7375   return new VPBlendRecipe(Phi, Operands);
7376 }
7377 
7378 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
7379                                                    VPlan &Plan) const {
7380 
7381   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7382       [this, CI](ElementCount VF) {
7383         return CM.isScalarWithPredication(CI, VF);
7384       },
7385       Range);
7386 
7387   if (IsPredicated)
7388     return nullptr;
7389 
7390   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7391   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7392              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
7393     return nullptr;
7394 
7395   auto willWiden = [&](ElementCount VF) -> bool {
7396     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag shows whether we use an intrinsic or a regular call for the
    // vectorized version of the instruction.
    // Is it beneficial to perform the intrinsic call compared to the lib call?
7401     bool NeedToScalarize = false;
7402     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
7403     bool UseVectorIntrinsic =
7404         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
7405     return UseVectorIntrinsic || !NeedToScalarize;
7406   };
7407 
7408   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7409     return nullptr;
7410 
7411   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
7412 }
7413 
7414 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7415   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7416          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // Instruction should be widened, unless it is scalar after vectorization,
  // scalarization is profitable, or it is predicated.
7419   auto WillScalarize = [this, I](ElementCount VF) -> bool {
7420     return CM.isScalarAfterVectorization(I, VF) ||
7421            CM.isProfitableToScalarize(I, VF) ||
7422            CM.isScalarWithPredication(I, VF);
7423   };
7424   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
7425                                                              Range);
7426 }
7427 
7428 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
7429   auto IsVectorizableOpcode = [](unsigned Opcode) {
7430     switch (Opcode) {
7431     case Instruction::Add:
7432     case Instruction::And:
7433     case Instruction::AShr:
7434     case Instruction::BitCast:
7435     case Instruction::FAdd:
7436     case Instruction::FCmp:
7437     case Instruction::FDiv:
7438     case Instruction::FMul:
7439     case Instruction::FNeg:
7440     case Instruction::FPExt:
7441     case Instruction::FPToSI:
7442     case Instruction::FPToUI:
7443     case Instruction::FPTrunc:
7444     case Instruction::FRem:
7445     case Instruction::FSub:
7446     case Instruction::ICmp:
7447     case Instruction::IntToPtr:
7448     case Instruction::LShr:
7449     case Instruction::Mul:
7450     case Instruction::Or:
7451     case Instruction::PtrToInt:
7452     case Instruction::SDiv:
7453     case Instruction::Select:
7454     case Instruction::SExt:
7455     case Instruction::Shl:
7456     case Instruction::SIToFP:
7457     case Instruction::SRem:
7458     case Instruction::Sub:
7459     case Instruction::Trunc:
7460     case Instruction::UDiv:
7461     case Instruction::UIToFP:
7462     case Instruction::URem:
7463     case Instruction::Xor:
7464     case Instruction::ZExt:
7465       return true;
7466     }
7467     return false;
7468   };
7469 
7470   if (!IsVectorizableOpcode(I->getOpcode()))
7471     return nullptr;
7472 
7473   // Success: widen this instruction.
7474   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
7475 }
7476 
7477 VPBasicBlock *VPRecipeBuilder::handleReplication(
7478     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
7479     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
7480     VPlanPtr &Plan) {
7481   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7482       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
7483       Range);
7484 
7485   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7486       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
7487       Range);
7488 
7489   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
7490                                        IsUniform, IsPredicated);
7491   setRecipe(I, Recipe);
7492 
7493   // Find if I uses a predicated instruction. If so, it will use its scalar
7494   // value. Avoid hoisting the insert-element which packs the scalar value into
7495   // a vector value, as that happens iff all users use the vector value.
7496   for (auto &Op : I->operands())
7497     if (auto *PredInst = dyn_cast<Instruction>(Op))
7498       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
7499         PredInst2Recipe[PredInst]->setAlsoPack(false);
7500 
  // Finalize the recipe for Instr; handle the non-predicated case first.
7502   if (!IsPredicated) {
7503     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7504     VPBB->appendRecipe(Recipe);
7505     return VPBB;
7506   }
7507   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7508   assert(VPBB->getSuccessors().empty() &&
7509          "VPBB has successors when handling predicated replication.");
7510   // Record predicated instructions for above packing optimizations.
7511   PredInst2Recipe[I] = Recipe;
7512   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7513   VPBlockUtils::insertBlockAfter(Region, VPBB);
7514   auto *RegSucc = new VPBasicBlock();
7515   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7516   return RegSucc;
7517 }
7518 
7519 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7520                                                       VPRecipeBase *PredRecipe,
7521                                                       VPlanPtr &Plan) {
7522   // Instructions marked for predication are replicated and placed under an
7523   // if-then construct to prevent side-effects.
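  // The region built below has, roughly, the shape:
  //   pred.<opcode>.entry:    branch-on-mask
  //   pred.<opcode>.if:       the replicated, predicated instruction
  //   pred.<opcode>.continue: optional phi merging the predicated value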
7524 
7525   // Generate recipes to compute the block mask for this region.
7526   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7527 
7528   // Build the triangular if-then region.
7529   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7530   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7531   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7532   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7533   auto *PHIRecipe =
7534       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7535   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7536   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7537   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7538 
7539   // Note: first set Entry as region entry and then connect successors starting
7540   // from it in order, to propagate the "parent" of each VPBasicBlock.
7541   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7542   VPBlockUtils::connectBlocks(Pred, Exit);
7543 
7544   return Region;
7545 }
7546 
7547 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
7548                                                       VFRange &Range,
7549                                                       VPlanPtr &Plan) {
7550   // First, check for specific widening recipes that deal with calls, memory
7551   // operations, inductions and Phi nodes.
7552   if (auto *CI = dyn_cast<CallInst>(Instr))
7553     return tryToWidenCall(CI, Range, *Plan);
7554 
7555   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
7556     return tryToWidenMemory(Instr, Range, Plan);
7557 
7558   VPRecipeBase *Recipe;
7559   if (auto Phi = dyn_cast<PHINode>(Instr)) {
7560     if (Phi->getParent() != OrigLoop->getHeader())
7561       return tryToBlend(Phi, Plan);
7562     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
7563       return Recipe;
7564     return new VPWidenPHIRecipe(Phi);
7565   }
7566 
7567   if (isa<TruncInst>(Instr) &&
7568       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
7569     return Recipe;
7570 
7571   if (!shouldWiden(Instr, Range))
7572     return nullptr;
7573 
7574   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
7575     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
7576                                 OrigLoop);
7577 
7578   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
7579     bool InvariantCond =
7580         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
7581     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
7582                                    InvariantCond);
7583   }
7584 
7585   return tryToWiden(Instr, *Plan);
7586 }
7587 
7588 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7589                                                         unsigned MaxVF) {
7590   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7591 
7592   // Collect conditions feeding internal conditional branches; they need to be
7593   // represented in VPlan for it to model masking.
7594   SmallPtrSet<Value *, 1> NeedDef;
7595 
7596   auto *Latch = OrigLoop->getLoopLatch();
7597   for (BasicBlock *BB : OrigLoop->blocks()) {
7598     if (BB == Latch)
7599       continue;
7600     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7601     if (Branch && Branch->isConditional())
7602       NeedDef.insert(Branch->getCondition());
7603   }
7604 
  // If the tail is to be folded by masking, the primary induction variable, if
  // it exists, needs to be represented in VPlan for it to model early-exit
  // masking.
7607   // Also, both the Phi and the live-out instruction of each reduction are
7608   // required in order to introduce a select between them in VPlan.
7609   if (CM.foldTailByMasking()) {
7610     if (Legal->getPrimaryInduction())
7611       NeedDef.insert(Legal->getPrimaryInduction());
7612     for (auto &Reduction : Legal->getReductionVars()) {
7613       NeedDef.insert(Reduction.first);
7614       NeedDef.insert(Reduction.second.getLoopExitInstr());
7615     }
7616   }
7617 
7618   // Collect instructions from the original loop that will become trivially dead
7619   // in the vectorized loop. We don't need to vectorize these instructions. For
7620   // example, original induction update instructions can become dead because we
7621   // separately emit induction "steps" when generating code for the new loop.
7622   // Similarly, we create a new latch condition when setting up the structure
7623   // of the new loop, so the old one can become dead.
7624   SmallPtrSet<Instruction *, 4> DeadInstructions;
7625   collectTriviallyDeadInstructions(DeadInstructions);
7626 
7627   // Add assume instructions we need to drop to DeadInstructions, to prevent
7628   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
7630   // control flow is preserved, we should keep them.
7631   auto &ConditionalAssumes = Legal->getConditionalAssumes();
7632   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
7633 
7634   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7635   // Dead instructions do not need sinking. Remove them from SinkAfter.
7636   for (Instruction *I : DeadInstructions)
7637     SinkAfter.erase(I);
7638 
7639   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7640     VFRange SubRange = {VF, MaxVF + 1};
7641     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7642                                              DeadInstructions, SinkAfter));
7643     VF = SubRange.End;
7644   }
7645 }
7646 
7647 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7648     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7649     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7650     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7651 
7652   // Hold a mapping from predicated instructions to their recipes, in order to
7653   // fix their AlsoPack behavior if a user is determined to replicate and use a
7654   // scalar instead of vector value.
7655   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7656 
7657   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7658 
7659   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
7660 
7661   // ---------------------------------------------------------------------------
7662   // Pre-construction: record ingredients whose recipes we'll need to further
7663   // process after constructing the initial VPlan.
7664   // ---------------------------------------------------------------------------
7665 
7666   // Mark instructions we'll need to sink later and their targets as
7667   // ingredients whose recipe we'll need to record.
7668   for (auto &Entry : SinkAfter) {
7669     RecipeBuilder.recordRecipeOf(Entry.first);
7670     RecipeBuilder.recordRecipeOf(Entry.second);
7671   }
7672   for (auto &Reduction : CM.getInLoopReductionChains()) {
7673     PHINode *Phi = Reduction.first;
7674     RecurrenceDescriptor::RecurrenceKind Kind =
7675         Legal->getReductionVars()[Phi].getRecurrenceKind();
7676     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7677 
7678     RecipeBuilder.recordRecipeOf(Phi);
7679     for (auto &R : ReductionOperations) {
7680       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
7682       // need to record the ICmp recipe, so it can be removed later.
7683       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7684           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7685         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
7686       }
7687     }
7688   }
7689 
7690   // For each interleave group which is relevant for this (possibly trimmed)
7691   // Range, add it to the set of groups to be later applied to the VPlan and add
7692   // placeholders for its members' Recipes which we'll be replacing with a
7693   // single VPInterleaveRecipe.
7694   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7695     auto applyIG = [IG, this](ElementCount VF) -> bool {
7696       return (VF.isVector() && // Query is illegal for VF == 1
7697               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7698                   LoopVectorizationCostModel::CM_Interleave);
7699     };
7700     if (!getDecisionAndClampRange(applyIG, Range))
7701       continue;
7702     InterleaveGroups.insert(IG);
7703     for (unsigned i = 0; i < IG->getFactor(); i++)
7704       if (Instruction *Member = IG->getMember(i))
7705         RecipeBuilder.recordRecipeOf(Member);
  }
7707 
7708   // ---------------------------------------------------------------------------
7709   // Build initial VPlan: Scan the body of the loop in a topological order to
7710   // visit each basic block after having visited its predecessor basic blocks.
7711   // ---------------------------------------------------------------------------
7712 
7713   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7714   auto Plan = std::make_unique<VPlan>();
7715   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7716   Plan->setEntry(VPBB);
7717 
7718   // Represent values that will have defs inside VPlan.
7719   for (Value *V : NeedDef)
7720     Plan->addVPValue(V);
7721 
7722   // Scan the body of the loop in a topological order to visit each basic block
7723   // after having visited its predecessor basic blocks.
7724   LoopBlocksDFS DFS(OrigLoop);
7725   DFS.perform(LI);
7726 
7727   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7728     // Relevant instructions from basic block BB will be grouped into VPRecipe
7729     // ingredients and fill a new VPBasicBlock.
7730     unsigned VPBBsForBB = 0;
7731     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7732     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7733     VPBB = FirstVPBBForBB;
7734     Builder.setInsertPoint(VPBB);
7735 
7736     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
7738     for (Instruction &I : BB->instructionsWithoutDebug()) {
7739       Instruction *Instr = &I;
7740 
7741       // First filter out irrelevant instructions, to ensure no recipes are
7742       // built for them.
7743       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
7744         continue;
7745 
7746       if (auto Recipe =
7747               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
7748         RecipeBuilder.setRecipe(Instr, Recipe);
7749         VPBB->appendRecipe(Recipe);
7750         continue;
7751       }
7752 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
7755       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7756           Instr, Range, VPBB, PredInst2Recipe, Plan);
7757       if (NextVPBB != VPBB) {
7758         VPBB = NextVPBB;
7759         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7760                                     : "");
7761       }
7762     }
7763   }
7764 
  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one (VPBB), reflecting original
  // basic blocks with no recipes.
7768   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7769   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7770   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7771   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7772   delete PreEntry;
7773 
7774   // ---------------------------------------------------------------------------
7775   // Transform initial VPlan: Apply previously taken decisions, in order, to
7776   // bring the VPlan to its final state.
7777   // ---------------------------------------------------------------------------
7778 
7779   // Apply Sink-After legal constraints.
7780   for (auto &Entry : SinkAfter) {
7781     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7782     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7783     Sink->moveAfter(Target);
7784   }
7785 
7786   // Interleave memory: for each Interleave Group we marked earlier as relevant
7787   // for this VPlan, replace the Recipes widening its memory instructions with a
7788   // single VPInterleaveRecipe at its insertion point.
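  // For example, a group of two interleaved loads that each had a
  // VPWidenMemoryInstructionRecipe is left with a single VPInterleaveRecipe;
  // the per-member recipes are erased below.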
7789   for (auto IG : InterleaveGroups) {
7790     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7791         RecipeBuilder.getRecipe(IG->getInsertPos()));
7792     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7793         ->insertBefore(Recipe);
7794 
7795     for (unsigned i = 0; i < IG->getFactor(); ++i)
7796       if (Instruction *Member = IG->getMember(i)) {
7797         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7798       }
7799   }
7800 
7801   // Adjust the recipes for any inloop reductions.
7802   if (Range.Start > 1)
7803     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
7804 
7805   // Finally, if tail is folded by masking, introduce selects between the phi
7806   // and the live-out instruction of each reduction, at the end of the latch.
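  // That is, each reduction's live-out Red becomes select(HeaderMask, Red,
  // Phi), so lanes masked off by tail folding keep the phi's incoming value.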
7807   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
7808     Builder.setInsertPoint(VPBB);
7809     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7810     for (auto &Reduction : Legal->getReductionVars()) {
7811       assert(!CM.isInLoopReduction(Reduction.first) &&
7812              "Didn't expect inloop tail folded reduction yet!");
7813       VPValue *Phi = Plan->getVPValue(Reduction.first);
7814       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7815       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7816     }
7817   }
7818 
7819   std::string PlanName;
7820   raw_string_ostream RSO(PlanName);
7821   ElementCount VF = ElementCount::getFixed(Range.Start);
7822   Plan->addVF(VF);
7823   RSO << "Initial VPlan for VF={" << VF;
7824   for (VF *= 2; VF.getKnownMinValue() < Range.End; VF *= 2) {
7825     Plan->addVF(VF);
7826     RSO << "," << VF;
7827   }
7828   RSO << "},UF>=1";
7829   RSO.flush();
7830   Plan->setName(PlanName);
7831 
7832   return Plan;
7833 }
7834 
7835 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7836   // Outer loop handling: They may require CFG and instruction level
7837   // transformations before even evaluating whether vectorization is profitable.
7838   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7839   // the vectorization pipeline.
7840   assert(!OrigLoop->isInnermost());
7841   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7842 
7843   // Create new empty VPlan
7844   auto Plan = std::make_unique<VPlan>();
7845 
7846   // Build hierarchical CFG
7847   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7848   HCFGBuilder.buildHierarchicalCFG();
7849 
7850   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7851     Plan->addVF(ElementCount::getFixed(VF));
7852 
7853   if (EnableVPlanPredication) {
7854     VPlanPredicator VPP(*Plan);
7855     VPP.predicate();
7856 
7857     // Avoid running transformation to recipes until masked code generation in
7858     // VPlan-native path is in place.
7859     return Plan;
7860   }
7861 
7862   SmallPtrSet<Instruction *, 1> DeadInstructions;
7863   VPlanTransforms::VPInstructionsToVPRecipes(
7864       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7865   return Plan;
7866 }
7867 
7868 // Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi needs to be converted to
7870 // reductions, with one operand being vector and the other being the scalar
7871 // reduction chain.
7872 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
7873     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
7874   for (auto &Reduction : CM.getInLoopReductionChains()) {
7875     PHINode *Phi = Reduction.first;
7876     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
7877     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
7878 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
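    // For example, for a chain phi -> add1 -> add2 feeding the reduction,
    // Chain starts at the phi; when visiting add1, ChainOp is the phi and
    // VecOp is add1's other operand, and Chain then advances to add1 for the
    // subsequent add2.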
7883     Instruction *Chain = Phi;
7884     for (Instruction *R : ReductionOperations) {
7885       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
7886       RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
7887 
7888       VPValue *ChainOp = Plan->getVPValue(Chain);
7889       unsigned FirstOpId;
7890       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7891           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7892         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
7893                "Expected to replace a VPWidenSelectSC");
7894         FirstOpId = 1;
7895       } else {
7896         assert(isa<VPWidenRecipe>(WidenRecipe) &&
7897                "Expected to replace a VPWidenSC");
7898         FirstOpId = 0;
7899       }
7900       unsigned VecOpId =
7901           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
7902       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
7903 
7904       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
7905           &RdxDesc, R, ChainOp, VecOp, Legal->hasFunNoNaNAttr(), TTI);
7906       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
7907       WidenRecipe->eraseFromParent();
7908 
7909       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
7910           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
7911         VPRecipeBase *CompareRecipe =
7912             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
7913         assert(isa<VPWidenRecipe>(CompareRecipe) &&
7914                "Expected to replace a VPWidenSC");
7915         CompareRecipe->eraseFromParent();
7916       }
7917       Chain = R;
7918     }
7919   }
7920 }
7921 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
7926 
7927 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7928     Value *V, const VPIteration &Instance) {
7929   return ILV.getOrCreateScalarValue(V, Instance);
7930 }
7931 
7932 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
7933                                VPSlotTracker &SlotTracker) const {
7934   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7935   IG->getInsertPos()->printAsOperand(O, false);
7936   O << ", ";
7937   getAddr()->printAsOperand(O, SlotTracker);
7938   VPValue *Mask = getMask();
7939   if (Mask) {
7940     O << ", ";
7941     Mask->printAsOperand(O, SlotTracker);
7942   }
7943   for (unsigned i = 0; i < IG->getFactor(); ++i)
7944     if (Instruction *I = IG->getMember(i))
7945       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
7946 }
7947 
7948 void VPWidenCallRecipe::execute(VPTransformState &State) {
7949   State.ILV->widenCallInstruction(Ingredient, *this, State);
7950 }
7951 
7952 void VPWidenSelectRecipe::execute(VPTransformState &State) {
7953   State.ILV->widenSelectInstruction(Ingredient, *this, InvariantCond, State);
7954 }
7955 
7956 void VPWidenRecipe::execute(VPTransformState &State) {
7957   State.ILV->widenInstruction(Ingredient, *this, State);
7958 }
7959 
7960 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7961   State.ILV->widenGEP(GEP, *this, State.UF, State.VF, IsPtrLoopInvariant,
7962                       IsIndexLoopInvariant, State);
7963 }
7964 
7965 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7966   assert(!State.Instance && "Int or FP induction being replicated.");
7967   State.ILV->widenIntOrFpInduction(IV, Trunc);
7968 }
7969 
7970 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7971   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7972 }
7973 
7974 void VPBlendRecipe::execute(VPTransformState &State) {
7975   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7976   // We know that all PHIs in non-header blocks are converted into
7977   // selects, so we don't have to worry about the insertion order and we
7978   // can just use the builder.
7979   // At this point we generate the predication tree. There may be
7980   // duplications since this is a simple recursive scan, but future
7981   // optimizations will clean it up.
7982 
7983   unsigned NumIncoming = getNumIncomingValues();
7984 
7985   // Generate a sequence of selects of the form:
7986   // SELECT(Mask3, In3,
7987   //        SELECT(Mask2, In2,
7988   //               SELECT(Mask1, In1,
7989   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // are essentially undef and are taken from In0.
7992   InnerLoopVectorizer::VectorParts Entry(State.UF);
7993   for (unsigned In = 0; In < NumIncoming; ++In) {
7994     for (unsigned Part = 0; Part < State.UF; ++Part) {
7995       // We might have single edge PHIs (blocks) - use an identity
7996       // 'select' for the first PHI operand.
7997       Value *In0 = State.get(getIncomingValue(In), Part);
7998       if (In == 0)
7999         Entry[Part] = In0; // Initialize with the first incoming value.
8000       else {
8001         // Select between the current value and the previous incoming edge
8002         // based on the incoming mask.
8003         Value *Cond = State.get(getMask(In), Part);
8004         Entry[Part] =
8005             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8006       }
8007     }
8008   }
8009   for (unsigned Part = 0; Part < State.UF; ++Part)
8010     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
8011 }
8012 
8013 void VPInterleaveRecipe::execute(VPTransformState &State) {
8014   assert(!State.Instance && "Interleave group being replicated.");
8015   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
8016 }
8017 
8018 void VPReductionRecipe::execute(VPTransformState &State) {
8019   assert(!State.Instance && "Reduction being replicated.");
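  // Sketch of the per-part lowering for, e.g., an integer add reduction:
  //   NewRed      = reduce.add(VecOp[Part])
  //   NextInChain = NewRed + PrevInChain[Part]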
8020   for (unsigned Part = 0; Part < State.UF; ++Part) {
8021     unsigned Kind = RdxDesc->getRecurrenceKind();
8022     Value *NewVecOp = State.get(VecOp, Part);
8023     Value *NewRed =
8024         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
8025     Value *PrevInChain = State.get(ChainOp, Part);
8026     Value *NextInChain;
8027     if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8028         Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8029       NextInChain =
8030           createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
8031                          NewRed, PrevInChain);
8032     } else {
8033       NextInChain = State.Builder.CreateBinOp(
8034           (Instruction::BinaryOps)I->getOpcode(), NewRed, PrevInChain);
8035     }
8036     State.ValueMap.setVectorValue(I, Part, NextInChain);
8037   }
8038 }
8039 
8040 void VPReplicateRecipe::execute(VPTransformState &State) {
8041   if (State.Instance) { // Generate a single instance.
8042     State.ILV->scalarizeInstruction(Ingredient, *this, *State.Instance,
8043                                     IsPredicated, State);
8044     // Insert scalar instance packing it into a vector.
8045     if (AlsoPack && State.VF.isVector()) {
8046       // If we're constructing lane 0, initialize to start from undef.
8047       if (State.Instance->Lane == 0) {
8048         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8049         Value *Undef =
8050             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
8051         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
8052       }
8053       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
8054     }
8055     return;
8056   }
8057 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
8061   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8062   for (unsigned Part = 0; Part < State.UF; ++Part)
8063     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8064       State.ILV->scalarizeInstruction(Ingredient, *this, {Part, Lane},
8065                                       IsPredicated, State);
8066 }
8067 
8068 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8069   assert(State.Instance && "Branch on Mask works only on single instance.");
8070 
8071   unsigned Part = State.Instance->Part;
8072   unsigned Lane = State.Instance->Lane;
8073 
8074   Value *ConditionBit = nullptr;
8075   VPValue *BlockInMask = getMask();
8076   if (BlockInMask) {
8077     ConditionBit = State.get(BlockInMask, Part);
8078     if (ConditionBit->getType()->isVectorTy())
8079       ConditionBit = State.Builder.CreateExtractElement(
8080           ConditionBit, State.Builder.getInt32(Lane));
8081   } else // Block in mask is all-one.
8082     ConditionBit = State.Builder.getTrue();
8083 
8084   // Replace the temporary unreachable terminator with a new conditional branch,
8085   // whose two destinations will be set later when they are created.
8086   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8087   assert(isa<UnreachableInst>(CurrentTerminator) &&
8088          "Expected to replace unreachable terminator with conditional branch.");
8089   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8090   CondBr->setSuccessor(0, nullptr);
8091   ReplaceInstWithInst(CurrentTerminator, CondBr);
8092 }
8093 
8094 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8095   assert(State.Instance && "Predicated instruction PHI works per instance.");
8096   Instruction *ScalarPredInst = cast<Instruction>(
8097       State.ValueMap.getScalarValue(PredInst, *State.Instance));
8098   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8099   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8100   assert(PredicatingBB && "Predicated block has no single predecessor.");
8101 
8102   // By current pack/unpack logic we need to generate only a single phi node: if
8103   // a vector value for the predicated instruction exists at this point it means
8104   // the instruction has vector users only, and a phi for the vector value is
8105   // needed. In this case the recipe of the predicated instruction is marked to
8106   // also do that packing, thereby "hoisting" the insert-element sequence.
8107   // Otherwise, a phi node for the scalar value is needed.
8108   unsigned Part = State.Instance->Part;
8109   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8110     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8111     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8112     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8113     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8114     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8115     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8116   } else {
8117     Type *PredInstType = PredInst->getType();
8118     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8119     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8120     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8121     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8122   }
8123 }
8124 
8125 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8126   VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
8127   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
8128                                         getMask());
8129 }
8130 
8131 // Determine how to lower the scalar epilogue, which depends on 1) optimising
8132 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8133 // predication, and 4) a TTI hook that analyses whether the loop is suitable
8134 // for predication.
8135 static ScalarEpilogueLowering getScalarEpilogueLowering(
8136     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8137     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8138     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8139     LoopVectorizationLegality &LVL) {
8140   // 1) OptSize takes precedence over all other options, i.e. if this is set,
8141   // don't look at hints or options, and don't request a scalar epilogue.
8142   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8143   // LoopAccessInfo (due to code dependency and not being able to reliably get
8144   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8145   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
8146   // versioning when the vectorization is forced, unlike hasOptSize. So revert
8147   // back to the old way and vectorize with versioning when forced. See D81345.)
8148   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8149                                                       PGSOQueryType::IRPass) &&
8150                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8151     return CM_ScalarEpilogueNotAllowedOptSize;
8152 
8153   bool PredicateOptDisabled = PreferPredicateOverEpilogue.getNumOccurrences() &&
8154                               !PreferPredicateOverEpilogue;
8155 
8156   // 2) Next, if disabling predication is requested on the command line, honour
8157   // this and request a scalar epilogue.
8158   if (PredicateOptDisabled)
8159     return CM_ScalarEpilogueAllowed;
8160 
8161   // 3) and 4) look if enabling predication is requested on the command line,
8162   // with a loop hint, or if the TTI hook indicates this is profitable, request
8163   // predication.
8164   if (PreferPredicateOverEpilogue ||
8165       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
8166       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8167                                         LVL.getLAI()) &&
8168        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
8169     return CM_ScalarEpilogueNotNeededUsePredicate;
8170 
8171   return CM_ScalarEpilogueAllowed;
8172 }
8173 
8174 // Process the loop in the VPlan-native vectorization path. This path builds
8175 // VPlan upfront in the vectorization pipeline, which allows to apply
8176 // VPlan-to-VPlan transformations from the very beginning without modifying the
8177 // input LLVM IR.
8178 static bool processLoopInVPlanNativePath(
8179     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8180     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8181     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8182     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8183     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8184 
8185   if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
8186     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8187     return false;
8188   }
8189   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8190   Function *F = L->getHeader()->getParent();
8191   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8192 
8193   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8194       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8195 
8196   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8197                                 &Hints, IAI);
8198   // Use the planner for outer loop vectorization.
8199   // TODO: CM is not used at this point inside the planner. Turn CM into an
8200   // optional argument if we don't need it in the future.
8201   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8202 
8203   // Get user vectorization factor.
8204   const unsigned UserVF = Hints.getWidth();
8205 
8206   // Plan how to best vectorize, return the best VF and its cost.
8207   const VectorizationFactor VF =
8208       LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF));
8209 
8210   // If we are stress testing VPlan builds, do not attempt to generate vector
8211   // code. Masked vector code generation support will follow soon.
8212   // Also, do not attempt to vectorize if no vector code will be produced.
8213   if (VPlanBuildStressTest || EnableVPlanPredication ||
8214       VectorizationFactor::Disabled() == VF)
8215     return false;
8216 
8217   LVP.setBestPlan(VF.Width, 1);
8218 
8219   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
8220                          &CM, BFI, PSI);
8221   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8222                     << L->getHeader()->getParent()->getName() << "\"\n");
8223   LVP.executePlan(LB, DT);
8224 
8225   // Mark the loop as already vectorized to avoid vectorizing again.
8226   Hints.setAlreadyVectorized();
8227 
8228   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8229   return true;
8230 }
8231 
8232 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8233     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8234                                !EnableLoopInterleaving),
8235       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8236                               !EnableLoopVectorization) {}
8237 
8238 bool LoopVectorizePass::processLoop(Loop *L) {
8239   assert((EnableVPlanNativePath || L->isInnermost()) &&
8240          "VPlan-native path is not enabled. Only process inner loops.");
8241 
8242 #ifndef NDEBUG
8243   const std::string DebugLocStr = getDebugLocString(L);
8244 #endif /* NDEBUG */
8245 
8246   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
8247                     << L->getHeader()->getParent()->getName() << "\" from "
8248                     << DebugLocStr << "\n");
8249 
8250   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
8251 
8252   LLVM_DEBUG(
8253       dbgs() << "LV: Loop hints:"
8254              << " force="
8255              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8256                      ? "disabled"
8257                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8258                             ? "enabled"
8259                             : "?"))
8260              << " width=" << Hints.getWidth()
8261              << " unroll=" << Hints.getInterleave() << "\n");
8262 
8263   // Function containing loop
8264   Function *F = L->getHeader()->getParent();
8265 
8266   // Looking at the diagnostic output is the only way to determine if a loop
8267   // was vectorized (other than looking at the IR or machine code), so it
8268   // is important to generate an optimization remark for each loop. Most of
8269   // these messages are generated as OptimizationRemarkAnalysis. Remarks
8270   // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose, reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
8273 
8274   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
8275     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
8276     return false;
8277   }
8278 
8279   PredicatedScalarEvolution PSE(*SE, *L);
8280 
8281   // Check if it is legal to vectorize the loop.
8282   LoopVectorizationRequirements Requirements(*ORE);
8283   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
8284                                 &Requirements, &Hints, DB, AC, BFI, PSI);
8285   if (!LVL.canVectorize(EnableVPlanNativePath)) {
8286     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
8287     Hints.emitRemarkWithHints();
8288     return false;
8289   }
8290 
8291   // Check the function attributes and profiles to find out if this function
8292   // should be optimized for size.
8293   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8294       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
8295 
8296   // Entrance to the VPlan-native vectorization path. Outer loops are processed
8297   // here. They may require CFG and instruction level transformations before
8298   // even evaluating whether vectorization is profitable. Since we cannot modify
8299   // the incoming IR, we need to build VPlan upfront in the vectorization
8300   // pipeline.
8301   if (!L->isInnermost())
8302     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
8303                                         ORE, BFI, PSI, Hints);
8304 
8305   assert(L->isInnermost() && "Inner loop expected.");
8306 
8307   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
8308   // count by optimizing for size, to minimize overheads.
8309   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
8310   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
8311     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
8312                       << "This loop is worth vectorizing only if no scalar "
8313                       << "iteration overheads are incurred.");
8314     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
8315       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
8316     else {
8317       LLVM_DEBUG(dbgs() << "\n");
8318       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
8319     }
8320   }
8321 
8322   // Check the function attributes to see if implicit floats are allowed.
8323   // FIXME: This check doesn't seem possibly correct -- what if the loop is
8324   // an integer loop and the vector instructions selected are purely integer
8325   // vector instructions?
8326   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
8327     reportVectorizationFailure(
8328         "Can't vectorize when the NoImplicitFloat attribute is used",
8329         "loop not vectorized due to NoImplicitFloat attribute",
8330         "NoImplicitFloat", ORE, L);
8331     Hints.emitRemarkWithHints();
8332     return false;
8333   }
8334 
8335   // Check if the target supports potentially unsafe FP vectorization.
8336   // FIXME: Add a check for the type of safety issue (denormal, signaling)
8337   // for the target we're vectorizing for, to make sure none of the
8338   // additional fp-math flags can help.
8339   if (Hints.isPotentiallyUnsafe() &&
8340       TTI->isFPVectorizationPotentiallyUnsafe()) {
8341     reportVectorizationFailure(
8342         "Potentially unsafe FP op prevents vectorization",
8343         "loop not vectorized due to unsafe FP support.",
8344         "UnsafeFP", ORE, L);
8345     Hints.emitRemarkWithHints();
8346     return false;
8347   }
8348 
8349   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
8350   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
8351 
8352   // If an override option has been passed in for interleaved accesses, use it.
8353   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
8354     UseInterleaved = EnableInterleavedMemAccesses;
8355 
8356   // Analyze interleaved memory accesses.
8357   if (UseInterleaved) {
8358     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
8359   }
8360 
8361   // Use the cost model.
8362   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
8363                                 F, &Hints, IAI);
8364   CM.collectValuesToIgnore();
8365 
8366   // Use the planner for vectorization.
8367   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
8368 
8369   // Get user vectorization factor and interleave count.
8370   unsigned UserVF = Hints.getWidth();
8371   unsigned UserIC = Hints.getInterleave();
8372 
8373   // Plan how to best vectorize, return the best VF and its cost.
8374   Optional<VectorizationFactor> MaybeVF =
8375       LVP.plan(ElementCount::getFixed(UserVF), UserIC);
8376 
8377   VectorizationFactor VF = VectorizationFactor::Disabled();
8378   unsigned IC = 1;
8379 
8380   if (MaybeVF) {
8381     VF = *MaybeVF;
8382     // Select the interleave count.
8383     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
8384   }
8385 
8386   // Identify the diagnostic messages that should be produced.
8387   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
8388   bool VectorizeLoop = true, InterleaveLoop = true;
8389   if (Requirements.doesNotMeet(F, L, Hints)) {
8390     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
8391                          "requirements.\n");
8392     Hints.emitRemarkWithHints();
8393     return false;
8394   }
8395 
8396   if (VF.Width == 1) {
8397     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
8398     VecDiagMsg = std::make_pair(
8399         "VectorizationNotBeneficial",
8400         "the cost-model indicates that vectorization is not beneficial");
8401     VectorizeLoop = false;
8402   }
8403 
8404   if (!MaybeVF && UserIC > 1) {
8405     // Tell the user interleaving was avoided up-front, despite being explicitly
8406     // requested.
8407     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
8408                          "interleaving should be avoided up front\n");
8409     IntDiagMsg = std::make_pair(
8410         "InterleavingAvoided",
8411         "Ignoring UserIC, because interleaving was avoided up front");
8412     InterleaveLoop = false;
8413   } else if (IC == 1 && UserIC <= 1) {
8414     // Tell the user interleaving is not beneficial.
8415     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
8416     IntDiagMsg = std::make_pair(
8417         "InterleavingNotBeneficial",
8418         "the cost-model indicates that interleaving is not beneficial");
8419     InterleaveLoop = false;
8420     if (UserIC == 1) {
8421       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
8422       IntDiagMsg.second +=
8423           " and is explicitly disabled or interleave count is set to 1";
8424     }
8425   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
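  // Remember the original loop metadata so that any
  // llvm.loop.vectorize.followup_* hints it carries can be transferred to the
  // remainder loop below.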
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // Vectorization was judged not worthwhile (the chosen VF is 1), but
    // interleaving still is, so interleave the scalar loop.
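    // Conceptually (a sketch, not the exact IR that is emitted): with IC == 2
    // a loop like
    //   for (i = 0; i < n; ++i) a[i] += b[i];
    // gets two scalar copies of its body per iteration,
    //   for (i = 0; i + 1 < n; i += 2) { a[i] += b[i]; a[i+1] += b[i+1]; }
    // followed by a scalar remainder loop for any leftover iterations.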
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
                               BFI, PSI);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is profitable to vectorize the loop, then do it.
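    // Conceptually (a sketch): with VF == 4 each vector iteration loads,
    // computes, and stores <4 x ...> elements and the induction variable is
    // advanced by 4; a scalar remainder loop (or a folded, masked tail when
    // that is enabled) handles the remaining iterations, and runtime checks
    // guard any aliasing or stride assumptions that were made.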
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                           &LVL, &CM, BFI, PSI);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

    // Add metadata to disable runtime unrolling of the scalar loop when there
    // are no runtime checks for strides and memory: a scalar loop that is
    // rarely executed is not worth unrolling.
    if (!LB.areSafetyChecksAdded())
      DisableRuntimeUnroll = true;

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

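  // At this point L is the scalar remainder loop. If the original loop carried
  // llvm.loop.vectorize.followup_all / followup_epilogue metadata, hand it to
  // the remainder loop; otherwise fall back to the default bookkeeping below.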
  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
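    // LCSSA guarantees that every value defined inside the loop and used
    // outside it is routed through a single-operand PHI in the exit block,
    // e.g. (sketch):
    //   exit:
    //     %v.lcssa = phi i32 [ %v, %loop.latch ]
    // which keeps the rewriting of live-out values local to those PHIs.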
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Report to the caller whether any loop was modified and whether the CFG
  // changed.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

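  // Build a callback that computes LoopAccessInfo on demand through the inner
  // loop analysis manager; the legality analysis queries it only for loops the
  // vectorizer actually considers.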
  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}