1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to assess the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation, the index is incremented
16 // by the SIMD vector width rather than by one.
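//
// For illustration only (conceptual, not tied to any particular target), a
// loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is rewritten, assuming a vector width (VF) of 4, roughly as
//
//   for (int i = 0; i + 4 <= n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3];   // one 'wide' iteration
//
// with the remaining iterations handled by a scalar remainder (epilogue) loop.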
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to
29 // the VPlan infrastructure and to introduce outer loop vectorization support
30 // (see docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD.
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/InstructionCost.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
142 #include "llvm/Transforms/Utils/SizeOpts.h"
143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
144 #include <algorithm>
145 #include <cassert>
146 #include <cstdint>
147 #include <cstdlib>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
202 // and that predication is preferred; it lists the available strategies. I.e.,
203 // the vectorizer will try to fold the tail loop (epilogue) into the vector body
204 // and predicate the instructions accordingly. If tail-folding fails, there are
205 // different fallback strategies depending on these values (sketch below):
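//
// As a conceptual sketch (assuming VF = 4, illustrative syntax only),
// tail-folding replaces
//
//   vector body for the first n - (n % 4) iterations + scalar epilogue
//
// with a single predicated vector body:
//
//   for (int i = 0; i < n; i += 4) {
//     mask = (i + {0,1,2,3}) < n;           // lanes beyond n are disabled
//     a[i..i+3] = b[i..i+3] + c[i..i+3];    // executed under 'mask'
//   }
//
// so no scalar epilogue loop is needed, at the cost of predicated instructions.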
206 namespace PreferPredicateTy {
207   enum Option {
208     ScalarEpilogue = 0,
209     PredicateElseScalarEpilogue,
210     PredicateOrDontVectorize
211   };
212 } // namespace PreferPredicateTy
213 
214 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
215     "prefer-predicate-over-epilogue",
216     cl::init(PreferPredicateTy::ScalarEpilogue),
217     cl::Hidden,
218     cl::desc("Tail-folding and predication preferences over creating a scalar "
219              "epilogue loop."),
220     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
221                          "scalar-epilogue",
222                          "Don't tail-predicate loops, create scalar epilogue."),
223               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
224                          "predicate-else-scalar-epilogue",
225                          "Prefer tail-folding, create scalar epilogue if tail "
226                          "folding fails."),
227               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
228                          "predicate-dont-vectorize",
229                          "Prefer tail-folding, don't attempt vectorization if "
230                          "tail-folding fails.")));
231 
232 static cl::opt<bool> MaximizeBandwidth(
233     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
234     cl::desc("Maximize bandwidth when selecting vectorization factor which "
235              "will be determined by the smallest type in loop."));
236 
237 static cl::opt<bool> EnableInterleavedMemAccesses(
238     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
239     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
240 
241 /// An interleave-group may need masking if it resides in a block that needs
242 /// predication, or in order to mask away gaps.
243 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
244     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
245     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
246 
247 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
248     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
249     cl::desc("We don't interleave loops with a estimated constant trip count "
250              "below this number"));
251 
252 static cl::opt<unsigned> ForceTargetNumScalarRegs(
253     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
254     cl::desc("A flag that overrides the target's number of scalar registers."));
255 
256 static cl::opt<unsigned> ForceTargetNumVectorRegs(
257     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
258     cl::desc("A flag that overrides the target's number of vector registers."));
259 
260 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
261     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
262     cl::desc("A flag that overrides the target's max interleave factor for "
263              "scalar loops."));
264 
265 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
266     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
267     cl::desc("A flag that overrides the target's max interleave factor for "
268              "vectorized loops."));
269 
270 static cl::opt<unsigned> ForceTargetInstructionCost(
271     "force-target-instruction-cost", cl::init(0), cl::Hidden,
272     cl::desc("A flag that overrides the target's expected cost for "
273              "an instruction to a single constant value. Mostly "
274              "useful for getting consistent testing."));
275 
276 static cl::opt<bool> ForceTargetSupportsScalableVectors(
277     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
278     cl::desc(
279         "Pretend that scalable vectors are supported, even if the target does "
280         "not support them. This flag should only be used for testing."));
281 
282 static cl::opt<unsigned> SmallLoopCost(
283     "small-loop-cost", cl::init(20), cl::Hidden,
284     cl::desc(
285         "The cost of a loop that is considered 'small' by the interleaver."));
286 
287 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
288     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
289     cl::desc("Enable the use of the block frequency analysis to access PGO "
290              "heuristics minimizing code growth in cold regions and being more "
291              "aggressive in hot regions."));
292 
293 // Interleave loops at runtime for load/store throughput.
294 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
295     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
296     cl::desc(
297         "Enable runtime interleaving until load/store ports are saturated"));
298 
299 /// Interleave small loops with scalar reductions.
300 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
301     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
302     cl::desc("Enable interleaving for loops with small iteration counts that "
303              "contain scalar reductions to expose ILP."));
304 
305 /// The number of stores in a loop that are allowed to need predication.
306 static cl::opt<unsigned> NumberOfStoresToPredicate(
307     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
308     cl::desc("Max number of stores to be predicated behind an if."));
309 
310 static cl::opt<bool> EnableIndVarRegisterHeur(
311     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
312     cl::desc("Count the induction variable only once when interleaving"));
313 
314 static cl::opt<bool> EnableCondStoresVectorization(
315     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
316     cl::desc("Enable if predication of stores during vectorization."));
317 
318 static cl::opt<unsigned> MaxNestedScalarReductionIC(
319     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
320     cl::desc("The maximum interleave count to use when interleaving a scalar "
321              "reduction in a nested loop."));
322 
323 static cl::opt<bool>
324     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
325                            cl::Hidden,
326                            cl::desc("Prefer in-loop vector reductions, "
327                                     "overriding the target's preference."));
328 
329 static cl::opt<bool> PreferPredicatedReductionSelect(
330     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
331     cl::desc(
332         "Prefer predicating a reduction operation over an after loop select."));
333 
334 cl::opt<bool> EnableVPlanNativePath(
335     "enable-vplan-native-path", cl::init(false), cl::Hidden,
336     cl::desc("Enable VPlan-native vectorization path with "
337              "support for outer loop vectorization."));
338 
339 // FIXME: Remove this switch once we have divergence analysis. Currently we
340 // assume divergent non-backedge branches when this switch is true.
341 cl::opt<bool> EnableVPlanPredication(
342     "enable-vplan-predication", cl::init(false), cl::Hidden,
343     cl::desc("Enable VPlan-native vectorization path predicator with "
344              "support for outer loop vectorization."));
345 
346 // This flag enables the stress testing of the VPlan H-CFG construction in the
347 // VPlan-native vectorization path. It must be used in conjunction with
348 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
349 // verification of the H-CFGs built.
350 static cl::opt<bool> VPlanBuildStressTest(
351     "vplan-build-stress-test", cl::init(false), cl::Hidden,
352     cl::desc(
353         "Build VPlan for every supported loop nest in the function and bail "
354         "out right after the build (stress test the VPlan H-CFG construction "
355         "in the VPlan-native vectorization path)."));
356 
357 cl::opt<bool> llvm::EnableLoopInterleaving(
358     "interleave-loops", cl::init(true), cl::Hidden,
359     cl::desc("Enable loop interleaving in Loop vectorization passes"));
360 cl::opt<bool> llvm::EnableLoopVectorization(
361     "vectorize-loops", cl::init(true), cl::Hidden,
362     cl::desc("Run the Loop vectorization passes"));
363 
364 /// A helper function that returns the type of loaded or stored value.
365 static Type *getMemInstValueType(Value *I) {
366   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
367          "Expected Load or Store instruction");
368   if (auto *LI = dyn_cast<LoadInst>(I))
369     return LI->getType();
370   return cast<StoreInst>(I)->getValueOperand()->getType();
371 }
372 
373 /// A helper function that returns true if the given type is irregular. The
374 /// type is irregular if its allocated size doesn't equal the store size of an
375 /// element of the corresponding vector type at the given vectorization factor.
376 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
377   // Determine if an array of VF elements of type Ty is "bitcast compatible"
378   // with a <VF x Ty> vector.
379   if (VF.isVector()) {
380     auto *VectorTy = VectorType::get(Ty, VF);
381     return TypeSize::get(VF.getKnownMinValue() *
382                              DL.getTypeAllocSize(Ty).getFixedValue(),
383                          VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
384   }
385 
386   // If the vectorization factor is one, we just check if an array of type Ty
387   // requires padding between elements.
388   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
389 }
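// For example, assuming a typical x86-64 data layout, x86_fp80 has a store
// size of 10 bytes but an alloc size of 16 bytes; an array of x86_fp80 is
// therefore padded and is not bitcast-compatible with <VF x x86_fp80>, so the
// type is treated as irregular here.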
390 
391 /// A helper function that returns the reciprocal of the block probability of
392 /// predicated blocks. If we return X, we are assuming the predicated block
393 /// will execute once for every X iterations of the loop header.
394 ///
395 /// TODO: We should use actual block probability here, if available. Currently,
396 ///       we always assume predicated blocks have a 50% chance of executing.
397 static unsigned getReciprocalPredBlockProb() { return 2; }
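// For example, with the current return value of 2, a cost C computed for an
// instruction in a predicated block is scaled to roughly C / 2, reflecting the
// assumption that the block executes on about half of the header's iterations.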
398 
399 /// A helper function that adds a 'fast' flag to floating-point operations.
400 static Value *addFastMathFlag(Value *V) {
401   if (isa<FPMathOperator>(V))
402     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
403   return V;
404 }
405 
406 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
407   if (isa<FPMathOperator>(V))
408     cast<Instruction>(V)->setFastMathFlags(FMF);
409   return V;
410 }
411 
412 /// A helper function that returns an integer or floating-point constant with
413 /// value C.
414 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
415   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
416                            : ConstantFP::get(Ty, C);
417 }
418 
419 /// Returns "best known" trip count for the specified loop \p L as defined by
420 /// the following procedure:
421 ///   1) Returns exact trip count if it is known.
422 ///   2) Returns expected trip count according to profile data if any.
423 ///   3) Returns upper bound estimate if it is known.
424 ///   4) Returns None if all of the above failed.
425 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
426   // Check if exact trip count is known.
427   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
428     return ExpectedTC;
429 
430   // Check if there is an expected trip count available from profile data.
431   if (LoopVectorizeWithBlockFrequency)
432     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
433       return EstimatedTC;
434 
435   // Check if upper bound estimate is known.
436   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
437     return ExpectedTC;
438 
439   return None;
440 }
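// A minimal usage sketch (hypothetical caller; 'SE', 'TheLoop' and
// 'InterleaveCount' are assumed to exist in the surrounding code), e.g. to
// avoid interleaving very short loops:
//
//   if (auto ExpectedTC = getSmallBestKnownTC(SE, TheLoop))
//     if (*ExpectedTC < TinyTripCountInterleaveThreshold)
//       InterleaveCount = 1; // not worth interleaving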
441 
442 namespace llvm {
443 
444 /// InnerLoopVectorizer vectorizes loops which contain only one basic
445 /// block to a specified vectorization factor (VF).
446 /// This class performs the widening of scalars into vectors, or multiple
447 /// scalars. This class also implements the following features:
448 /// * It inserts an epilogue loop for handling loops that don't have iteration
449 ///   counts that are known to be a multiple of the vectorization factor.
450 /// * It handles the code generation for reduction variables.
451 /// * Scalarization (implementation using scalars) of un-vectorizable
452 ///   instructions.
453 /// InnerLoopVectorizer does not perform any vectorization-legality
454 /// checks, and relies on the caller to check for the different legality
455 /// aspects. The InnerLoopVectorizer relies on the
456 /// LoopVectorizationLegality class to provide information about the induction
457 /// and reduction variables that were found in the loop.
458 class InnerLoopVectorizer {
459 public:
460   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
461                       LoopInfo *LI, DominatorTree *DT,
462                       const TargetLibraryInfo *TLI,
463                       const TargetTransformInfo *TTI, AssumptionCache *AC,
464                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
465                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
466                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
467                       ProfileSummaryInfo *PSI)
468       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
469         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
470         Builder(PSE.getSE()->getContext()),
471         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
472         BFI(BFI), PSI(PSI) {
473     // Query this against the original loop and save it here because the profile
474     // of the original loop header may change as the transformation happens.
475     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
476         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
477   }
478 
479   virtual ~InnerLoopVectorizer() = default;
480 
481   /// Create a new empty loop that will contain vectorized instructions later
482   /// on, while the old loop will be used as the scalar remainder. Control flow
483   /// is generated around the vectorized (and scalar epilogue) loops consisting
484   /// of various checks and bypasses. Return the pre-header block of the new
485   /// loop.
486   /// In the case of epilogue vectorization, this function is overridden to
487   /// handle the more complex control flow around the loops.
488   virtual BasicBlock *createVectorizedLoopSkeleton();
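  // A rough sketch of the resulting control flow (simplified; the run-time
  // checks are only emitted when required):
  //
  //   [ iteration-count check  ] --\
  //   [ SCEV checks            ] --+--> bypass to the scalar preheader
  //   [ memory run-time checks ] --/
  //   [ vector preheader ]
  //   [ vector loop body ]
  //   [ middle block     ] ----------> scalar preheader or exit block
  //   [ scalar (epilogue) loop ]
  //   [ exit block ]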
489 
490   /// Widen a single instruction within the innermost loop.
491   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
492                         VPTransformState &State);
493 
494   /// Widen a single call instruction within the innermost loop.
495   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
496                             VPTransformState &State);
497 
498   /// Widen a single select instruction within the innermost loop.
499   void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
500                               bool InvariantCond, VPTransformState &State);
501 
502   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
503   void fixVectorizedLoop();
504 
505   // Return true if any runtime check is added.
506   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
507 
508   /// A type for vectorized values in the new loop. Each value from the
509   /// original loop, when vectorized, is represented by UF vector values in the
510   /// new unrolled loop, where UF is the unroll factor.
511   using VectorParts = SmallVector<Value *, 2>;
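  // For example, with UF = 2 and VF = 4, an i32 value from the original loop
  // is represented by two <4 x i32> values in the vector loop, one per
  // unrolled part.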
512 
513   /// Vectorize a single GetElementPtrInst based on information gathered and
514   /// decisions taken during planning.
515   void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
516                 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
517                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
518 
519   /// Vectorize a single PHINode in a block. This method handles the induction
520   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
521   /// arbitrary length vectors.
522   void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
523                            Value *StartV, unsigned UF, ElementCount VF);
524 
525   /// A helper function to scalarize a single Instruction in the innermost loop.
526   /// Generates a sequence of scalar instances for each lane between \p MinLane
527   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
528   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
529   /// Instr's operands.
530   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
531                             const VPIteration &Instance, bool IfPredicateInstr,
532                             VPTransformState &State);
533 
534   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
535   /// is provided, the integer induction variable will first be truncated to
536   /// the corresponding type.
537   void widenIntOrFpInduction(PHINode *IV, Value *Start,
538                              TruncInst *Trunc = nullptr);
539 
540   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
541   /// vector or scalar value on-demand if one is not yet available. When
542   /// vectorizing a loop, we visit the definition of an instruction before its
543   /// uses. When visiting the definition, we either vectorize or scalarize the
544   /// instruction, creating an entry for it in the corresponding map. (In some
545   /// cases, such as induction variables, we will create both vector and scalar
546   /// entries.) Then, as we encounter uses of the definition, we derive values
547   /// for each scalar or vector use unless such a value is already available.
548   /// For example, if we scalarize a definition and one of its uses is vector,
549   /// we build the required vector on-demand with an insertelement sequence
550   /// when visiting the use. Otherwise, if the use is scalar, we can use the
551   /// existing scalar definition.
552   ///
553   /// Return a value in the new loop corresponding to \p V from the original
554   /// loop at unroll index \p Part. If the value has already been vectorized,
555   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
556   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
557   /// a new vector value on-demand by inserting the scalar values into a vector
558   /// with an insertelement sequence. If the value has been neither vectorized
559   /// nor scalarized, it must be loop invariant, so we simply broadcast the
560   /// value into a vector.
561   Value *getOrCreateVectorValue(Value *V, unsigned Part);
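  // A conceptual example: if %x was scalarized and a later use requires a
  // vector, the lanes are packed on demand with IR roughly of the form
  //
  //   %v0 = insertelement <4 x i32> undef, i32 %x.lane0, i32 0
  //   %v1 = insertelement <4 x i32> %v0,   i32 %x.lane1, i32 1
  //   ...and so on for the remaining lanes.
  //
  // Loop-invariant values are instead broadcast into a vector directly.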
562 
563   void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
564     VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
565   }
566 
567   /// Return a value in the new loop corresponding to \p V from the original
568   /// loop at unroll and vector indices \p Instance. If the value has been
569   /// vectorized but not scalarized, the necessary extractelement instruction
570   /// will be generated.
571   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
572 
573   /// Construct the vector value of a scalarized value \p V one lane at a time.
574   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
575 
576   /// Try to vectorize interleaved access group \p Group with the base address
577   /// given in \p Addr, optionally masking the vector operations if \p
578   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
579   /// values in the vectorized loop.
580   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
581                                 ArrayRef<VPValue *> VPDefs,
582                                 VPTransformState &State, VPValue *Addr,
583                                 ArrayRef<VPValue *> StoredValues,
584                                 VPValue *BlockInMask = nullptr);
585 
586   /// Vectorize Load and Store instructions with the base address given in \p
587   /// Addr, optionally masking the vector operations if \p BlockInMask is
588   /// non-null. Use \p State to translate given VPValues to IR values in the
589   /// vectorized loop.
590   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
591                                   VPValue *Def, VPValue *Addr,
592                                   VPValue *StoredValue, VPValue *BlockInMask);
593 
594   /// Set the debug location in the builder using the debug location in
595   /// the instruction.
596   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
597 
598   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
599   void fixNonInductionPHIs();
600 
601 protected:
602   friend class LoopVectorizationPlanner;
603 
604   /// A small list of PHINodes.
605   using PhiVector = SmallVector<PHINode *, 4>;
606 
607   /// A type for scalarized values in the new loop. Each value from the
608   /// original loop, when scalarized, is represented by UF x VF scalar values
609   /// in the new unrolled loop, where UF is the unroll factor and VF is the
610   /// vectorization factor.
611   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
612 
613   /// Set up the values of the IVs correctly when exiting the vector loop.
614   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
615                     Value *CountRoundDown, Value *EndValue,
616                     BasicBlock *MiddleBlock);
617 
618   /// Create a new induction variable inside L.
619   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
620                                    Value *Step, Instruction *DL);
621 
622   /// Handle all cross-iteration phis in the header.
623   void fixCrossIterationPHIs();
624 
625   /// Fix a first-order recurrence. This is the second phase of vectorizing
626   /// this phi node.
627   void fixFirstOrderRecurrence(PHINode *Phi);
628 
629   /// Fix a reduction cross-iteration phi. This is the second phase of
630   /// vectorizing this phi node.
631   void fixReduction(PHINode *Phi);
632 
633   /// Clear NSW/NUW flags from reduction instructions if necessary.
634   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
635 
636   /// The loop exit block may have single-value PHI nodes with some
637   /// incoming value. While vectorizing, we only handled real values
638   /// that were defined inside the loop, and we should have one value for
639   /// each predecessor of its parent basic block. See PR14725.
640   void fixLCSSAPHIs();
641 
642   /// Iteratively sink the scalarized operands of a predicated instruction into
643   /// the block that was created for it.
644   void sinkScalarOperands(Instruction *PredInst);
645 
646   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
647   /// represented as.
648   void truncateToMinimalBitwidths();
649 
650   /// Create a broadcast instruction. This method generates a broadcast
651   /// instruction (shuffle) for loop-invariant values and for the induction
652   /// value. If this is the induction variable, we extend it to N, N+1, ...;
653   /// this is needed because each iteration in the loop corresponds to a SIMD
654   /// element.
655   virtual Value *getBroadcastInstrs(Value *V);
656 
657   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
658   /// to each vector element of Val. The sequence starts at StartIdx.
659   /// \p Opcode is relevant for FP induction variables.
660   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
661                                Instruction::BinaryOps Opcode =
662                                Instruction::BinaryOpsEnd);
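  // For instance (illustrative values), with Val = <i, i, i, i>, StartIdx = 0
  // and Step = 1 this produces <i, i+1, i+2, i+3>; with StartIdx = 4 it
  // produces <i+4, i+5, i+6, i+7>, i.e. the lanes for the next unrolled part.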
663 
664   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
665   /// variable on which to base the steps, \p Step is the size of the step, and
666   /// \p EntryVal is the value from the original loop that maps to the steps.
667   /// Note that \p EntryVal doesn't have to be an induction variable - it
668   /// can also be a truncate instruction.
669   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
670                         const InductionDescriptor &ID);
671 
672   /// Create a vector induction phi node based on an existing scalar one. \p
673   /// EntryVal is the value from the original loop that maps to the vector phi
674   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
675   /// truncate instruction, instead of widening the original IV, we widen a
676   /// version of the IV truncated to \p EntryVal's type.
677   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
678                                        Value *Step, Value *Start,
679                                        Instruction *EntryVal);
680 
681   /// Returns true if an instruction \p I should be scalarized instead of
682   /// vectorized for the chosen vectorization factor.
683   bool shouldScalarizeInstruction(Instruction *I) const;
684 
685   /// Returns true if we should generate a scalar version of \p IV.
686   bool needsScalarInduction(Instruction *IV) const;
687 
688   /// If there is a cast involved in the induction variable \p ID, which should
689   /// be ignored in the vectorized loop body, this function records the
690   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
691   /// cast. We had already proved that the casted Phi is equal to the uncasted
692   /// Phi in the vectorized loop (under a runtime guard), and therefore
693   /// there is no need to vectorize the cast - the same value can be used in the
694   /// vector loop for both the Phi and the cast.
695   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
696   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
697   ///
698   /// \p EntryVal is the value from the original loop that maps to the vector
699   /// phi node and is used to distinguish what is the IV currently being
700   /// processed - original one (if \p EntryVal is a phi corresponding to the
701   /// original IV) or the "newly-created" one based on the proof mentioned above
702   /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
703   /// latter case \p EntryVal is a TruncInst and we must not record anything for
704   /// that IV, but it's error-prone to expect callers of this routine to care
705   /// about that, hence this explicit parameter.
706   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
707                                              const Instruction *EntryVal,
708                                              Value *VectorLoopValue,
709                                              unsigned Part,
710                                              unsigned Lane = UINT_MAX);
711 
712   /// Generate a shuffle sequence that will reverse the vector Vec.
713   virtual Value *reverseVector(Value *Vec);
714 
715   /// Returns (and creates if needed) the original loop trip count.
716   Value *getOrCreateTripCount(Loop *NewLoop);
717 
718   /// Returns (and creates if needed) the trip count of the widened loop.
719   Value *getOrCreateVectorTripCount(Loop *NewLoop);
720 
721   /// Returns a bitcasted value to the requested vector type.
722   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
723   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
724                                 const DataLayout &DL);
725 
726   /// Emit a bypass check to see if the vector trip count is zero, including if
727   /// it overflows.
728   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
729 
730   /// Emit a bypass check to see if all of the SCEV assumptions we've
731   /// had to make are correct.
732   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
733 
734   /// Emit bypass checks to check any memory assumptions we may have made.
735   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
736 
737   /// Compute the transformed value of Index at offset StartValue using step
738   /// StepValue.
739   /// For integer induction, returns StartValue + Index * StepValue.
740   /// For pointer induction, returns StartValue[Index * StepValue].
741   /// FIXME: The newly created binary instructions should contain nsw/nuw
742   /// flags, which can be found from the original scalar operations.
743   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
744                               const DataLayout &DL,
745                               const InductionDescriptor &ID) const;
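  // Illustrative examples: for an integer induction with StartValue 10 and
  // StepValue 3, Index 4 is transformed to 10 + 4 * 3 = 22; for a pointer
  // induction the result is instead a GEP equivalent to
  // &StartValue[4 * StepValue].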
746 
747   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
748   /// vector loop preheader, middle block and scalar preheader. Also
749   /// allocate a loop object for the new vector loop and return it.
750   Loop *createVectorLoopSkeleton(StringRef Prefix);
751 
752   /// Create new phi nodes for the induction variables to resume iteration count
753   /// in the scalar epilogue, from where the vectorized loop left off (given by
754   /// \p VectorTripCount).
755   /// In cases where the loop skeleton is more complicated (e.g., epilogue
756   /// vectorization) and the resume values can come from an additional bypass
757   /// block, the \p AdditionalBypass pair provides information about the bypass
758   /// block and the end value on the edge from bypass to this loop.
759   void createInductionResumeValues(
760       Loop *L, Value *VectorTripCount,
761       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
762 
763   /// Complete the loop skeleton by adding debug MDs, creating appropriate
764   /// conditional branches in the middle block, preparing the builder and
765   /// running the verifier. Take in the vector loop \p L as argument, and return
766   /// the preheader of the completed vector loop.
767   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
768 
769   /// Add additional metadata to \p To that was not present on \p Orig.
770   ///
771   /// Currently this is used to add the noalias annotations based on the
772   /// inserted memchecks.  Use this for instructions that are *cloned* into the
773   /// vector loop.
774   void addNewMetadata(Instruction *To, const Instruction *Orig);
775 
776   /// Add metadata from one instruction to another.
777   ///
778   /// This includes both the original MDs from \p From and additional ones (\see
779   /// addNewMetadata).  Use this for *newly created* instructions in the vector
780   /// loop.
781   void addMetadata(Instruction *To, Instruction *From);
782 
783   /// Similar to the previous function but it adds the metadata to a
784   /// vector of instructions.
785   void addMetadata(ArrayRef<Value *> To, Instruction *From);
786 
787   /// Allow subclasses to override and print debug traces before/after vplan
788   /// execution, when trace information is requested.
789   virtual void printDebugTracesAtStart() {}
790   virtual void printDebugTracesAtEnd() {}
791 
792   /// The original loop.
793   Loop *OrigLoop;
794 
795   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
796   /// dynamic knowledge to simplify SCEV expressions and converts them to a
797   /// more usable form.
798   PredicatedScalarEvolution &PSE;
799 
800   /// Loop Info.
801   LoopInfo *LI;
802 
803   /// Dominator Tree.
804   DominatorTree *DT;
805 
806   /// Alias Analysis.
807   AAResults *AA;
808 
809   /// Target Library Info.
810   const TargetLibraryInfo *TLI;
811 
812   /// Target Transform Info.
813   const TargetTransformInfo *TTI;
814 
815   /// Assumption Cache.
816   AssumptionCache *AC;
817 
818   /// Interface to emit optimization remarks.
819   OptimizationRemarkEmitter *ORE;
820 
821   /// LoopVersioning.  It's only set up (non-null) if memchecks were
822   /// used.
823   ///
824   /// This is currently only used to add no-alias metadata based on the
825   /// memchecks.  The actual versioning is performed manually.
826   std::unique_ptr<LoopVersioning> LVer;
827 
828   /// The vectorization SIMD factor to use. Each vector will have this many
829   /// elements.
830   ElementCount VF;
831 
832   /// The vectorization unroll factor to use. Each scalar is vectorized to this
833   /// many different vector instructions.
834   unsigned UF;
835 
836   /// The builder that we use.
837   IRBuilder<> Builder;
838 
839   // --- Vectorization state ---
840 
841   /// The vector-loop preheader.
842   BasicBlock *LoopVectorPreHeader;
843 
844   /// The scalar-loop preheader.
845   BasicBlock *LoopScalarPreHeader;
846 
847   /// Middle Block between the vector and the scalar.
848   BasicBlock *LoopMiddleBlock;
849 
850   /// The (unique) ExitBlock of the scalar loop.  Note that
851   /// there can be multiple exiting edges reaching this block.
852   BasicBlock *LoopExitBlock;
853 
854   /// The vector loop body.
855   BasicBlock *LoopVectorBody;
856 
857   /// The scalar loop body.
858   BasicBlock *LoopScalarBody;
859 
860   /// A list of all bypass blocks. The first block is the entry of the loop.
861   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
862 
863   /// The new Induction variable which was added to the new block.
864   PHINode *Induction = nullptr;
865 
866   /// The induction variable of the old basic block.
867   PHINode *OldInduction = nullptr;
868 
869   /// Maps values from the original loop to their corresponding values in the
870   /// vectorized loop. A key value can map to either vector values, scalar
871   /// values or both kinds of values, depending on whether the key was
872   /// vectorized and scalarized.
873   VectorizerValueMap VectorLoopValueMap;
874 
875   /// Store instructions that were predicated.
876   SmallVector<Instruction *, 4> PredicatedInstructions;
877 
878   /// Trip count of the original loop.
879   Value *TripCount = nullptr;
880 
881   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
882   Value *VectorTripCount = nullptr;
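  // For example, with an original trip count of 17, VF = 4 and UF = 2, the
  // vector trip count is 17 - (17 % 8) = 16; the one remaining iteration runs
  // in the scalar remainder loop.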
883 
884   /// The legality analysis.
885   LoopVectorizationLegality *Legal;
886 
887   /// The profitability analysis.
888   LoopVectorizationCostModel *Cost;
889 
890   // Record whether runtime checks are added.
891   bool AddedSafetyChecks = false;
892 
893   // Holds the end values for each induction variable. We save the end values
894   // so we can later fix up the external users of the induction variables.
895   DenseMap<PHINode *, Value *> IVEndValues;
896 
897   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
898   // fixed up at the end of vector code generation.
899   SmallVector<PHINode *, 8> OrigPHIsToFix;
900 
901   /// BFI and PSI are used to check for profile-guided size optimizations.
902   BlockFrequencyInfo *BFI;
903   ProfileSummaryInfo *PSI;
904 
905   // Whether this loop should be optimized for size based on profile-guided
906   // size optimizations.
907   bool OptForSizeBasedOnProfile;
908 };
909 
910 class InnerLoopUnroller : public InnerLoopVectorizer {
911 public:
912   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
913                     LoopInfo *LI, DominatorTree *DT,
914                     const TargetLibraryInfo *TLI,
915                     const TargetTransformInfo *TTI, AssumptionCache *AC,
916                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
917                     LoopVectorizationLegality *LVL,
918                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
919                     ProfileSummaryInfo *PSI)
920       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
921                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
922                             BFI, PSI) {}
923 
924 private:
925   Value *getBroadcastInstrs(Value *V) override;
926   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
927                        Instruction::BinaryOps Opcode =
928                        Instruction::BinaryOpsEnd) override;
929   Value *reverseVector(Value *Vec) override;
930 };
931 
932 /// Encapsulate information regarding vectorization of a loop and its epilogue.
933 /// This information is meant to be updated and used across two stages of
934 /// epilogue vectorization.
935 struct EpilogueLoopVectorizationInfo {
936   ElementCount MainLoopVF = ElementCount::getFixed(0);
937   unsigned MainLoopUF = 0;
938   ElementCount EpilogueVF = ElementCount::getFixed(0);
939   unsigned EpilogueUF = 0;
940   BasicBlock *MainLoopIterationCountCheck = nullptr;
941   BasicBlock *EpilogueIterationCountCheck = nullptr;
942   BasicBlock *SCEVSafetyCheck = nullptr;
943   BasicBlock *MemSafetyCheck = nullptr;
944   Value *TripCount = nullptr;
945   Value *VectorTripCount = nullptr;
946 
947   EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
948                                 unsigned EUF)
949       : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
950         EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
951     assert(EUF == 1 &&
952            "A high UF for the epilogue loop is likely not beneficial.");
953   }
954 };
955 
956 /// An extension of the inner loop vectorizer that creates a skeleton for a
957 /// vectorized loop that has its epilogue (residual) also vectorized.
958 /// The idea is to run the vplan on a given loop twice: first to set up the
959 /// skeleton and vectorize the main loop, and second to complete the skeleton
960 /// from the first step and vectorize the epilogue.  This is achieved by
961 /// deriving two concrete strategy classes from this base class and invoking
962 /// them in succession from the loop vectorizer planner.
963 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
964 public:
965   InnerLoopAndEpilogueVectorizer(
966       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
967       DominatorTree *DT, const TargetLibraryInfo *TLI,
968       const TargetTransformInfo *TTI, AssumptionCache *AC,
969       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
970       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
971       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
972       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
973                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
974         EPI(EPI) {}
975 
976   // Override this function to handle the more complex control flow around the
977   // three loops.
978   BasicBlock *createVectorizedLoopSkeleton() final override {
979     return createEpilogueVectorizedLoopSkeleton();
980   }
981 
982   /// The interface for creating a vectorized skeleton using one of two
983   /// different strategies, each corresponding to one execution of the vplan
984   /// as described above.
985   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
986 
987   /// Holds and updates state information required to vectorize the main loop
988   /// and its epilogue in two separate passes. This setup helps us avoid
989   /// regenerating and recomputing runtime safety checks. It also helps us to
990   /// shorten the iteration-count-check path length for the cases where the
991   /// iteration count of the loop is so small that the main vector loop is
992   /// completely skipped.
993   EpilogueLoopVectorizationInfo &EPI;
994 };
995 
996 /// A specialized derived class of inner loop vectorizer that performs
997 /// vectorization of *main* loops in the process of vectorizing loops and their
998 /// epilogues.
999 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
1000 public:
1001   EpilogueVectorizerMainLoop(
1002       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
1003       DominatorTree *DT, const TargetLibraryInfo *TLI,
1004       const TargetTransformInfo *TTI, AssumptionCache *AC,
1005       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
1006       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
1007       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
1008       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1009                                        EPI, LVL, CM, BFI, PSI) {}
1010   /// Implements the interface for creating a vectorized skeleton using the
1011   /// *main loop* strategy (i.e., the first pass of vplan execution).
1012   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1013 
1014 protected:
1015   /// Emits an iteration count bypass check once for the main loop (when \p
1016   /// ForEpilogue is false) and once for the epilogue loop (when \p
1017   /// ForEpilogue is true).
1018   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
1019                                              bool ForEpilogue);
1020   void printDebugTracesAtStart() override;
1021   void printDebugTracesAtEnd() override;
1022 };
1023 
1024 /// A specialized derived class of inner loop vectorizer that performs
1025 /// vectorization of *epilogue* loops in the process of vectorizing loops and
1026 /// their epilogues.
1027 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
1028 public:
1029   EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
1030                     LoopInfo *LI, DominatorTree *DT,
1031                     const TargetLibraryInfo *TLI,
1032                     const TargetTransformInfo *TTI, AssumptionCache *AC,
1033                     OptimizationRemarkEmitter *ORE,
1034                     EpilogueLoopVectorizationInfo &EPI,
1035                     LoopVectorizationLegality *LVL,
1036                     llvm::LoopVectorizationCostModel *CM,
1037                     BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
1038       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1039                                        EPI, LVL, CM, BFI, PSI) {}
1040   /// Implements the interface for creating a vectorized skeleton using the
1041   /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
1042   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1043 
1044 protected:
1045   /// Emits an iteration count bypass check after the main vector loop has
1046   /// finished to see if there are any iterations left to execute by either
1047   /// the vector epilogue or the scalar epilogue.
1048   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1049                                                       BasicBlock *Bypass,
1050                                                       BasicBlock *Insert);
1051   void printDebugTracesAtStart() override;
1052   void printDebugTracesAtEnd() override;
1053 };
1054 } // end namespace llvm
1055 
1056 /// Look for a meaningful debug location on the instruction or its
1057 /// operands.
1058 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1059   if (!I)
1060     return I;
1061 
1062   DebugLoc Empty;
1063   if (I->getDebugLoc() != Empty)
1064     return I;
1065 
1066   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
1067     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
1068       if (OpInst->getDebugLoc() != Empty)
1069         return OpInst;
1070   }
1071 
1072   return I;
1073 }
1074 
1075 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
1076   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1077     const DILocation *DIL = Inst->getDebugLoc();
1078     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1079         !isa<DbgInfoIntrinsic>(Inst)) {
1080       assert(!VF.isScalable() && "scalable vectors not yet supported.");
1081       auto NewDIL =
1082           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1083       if (NewDIL)
1084         B.SetCurrentDebugLocation(NewDIL.getValue());
1085       else
1086         LLVM_DEBUG(dbgs()
1087                    << "Failed to create new discriminator: "
1088                    << DIL->getFilename() << " Line: " << DIL->getLine());
1089     } else
1090       B.SetCurrentDebugLocation(DIL);
1092   } else
1093     B.SetCurrentDebugLocation(DebugLoc());
1094 }
1095 
1096 /// Write a record \p DebugMsg about vectorization failure to the debug
1097 /// output stream. If \p I is passed, it is an instruction that prevents
1098 /// vectorization.
1099 #ifndef NDEBUG
1100 static void debugVectorizationFailure(const StringRef DebugMsg,
1101     Instruction *I) {
1102   dbgs() << "LV: Not vectorizing: " << DebugMsg;
1103   if (I != nullptr)
1104     dbgs() << " " << *I;
1105   else
1106     dbgs() << '.';
1107   dbgs() << '\n';
1108 }
1109 #endif
1110 
1111 /// Create an analysis remark that explains why vectorization failed
1112 ///
1113 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1114 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1115 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1116 /// the location of the remark.  \return the remark object that can be
1117 /// streamed to.
1118 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1119     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1120   Value *CodeRegion = TheLoop->getHeader();
1121   DebugLoc DL = TheLoop->getStartLoc();
1122 
1123   if (I) {
1124     CodeRegion = I->getParent();
1125     // If there is no debug location attached to the instruction, fall back to
1126     // using the loop's.
1127     if (I->getDebugLoc())
1128       DL = I->getDebugLoc();
1129   }
1130 
1131   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
1132   R << "loop not vectorized: ";
1133   return R;
1134 }
1135 
1136 /// Return a value for Step multiplied by VF.
1137 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1138   assert(isa<ConstantInt>(Step) && "Expected an integer step");
1139   Constant *StepVal = ConstantInt::get(
1140       Step->getType(),
1141       cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1142   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1143 }
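// Example (illustrative): for Step = i64 1 and a fixed VF of 4 this returns
// the constant i64 4; for a scalable VF of <vscale x 4> it instead emits
// vscale * 4 via Builder.CreateVScale().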
1144 
1145 namespace llvm {
1146 
1147 void reportVectorizationFailure(const StringRef DebugMsg,
1148     const StringRef OREMsg, const StringRef ORETag,
1149     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
1150   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
1151   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1152   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
1153                 ORETag, TheLoop, I) << OREMsg);
1154 }
1155 
1156 } // end namespace llvm
1157 
1158 #ifndef NDEBUG
1159 /// \return string containing a file name and a line # for the given loop.
1160 static std::string getDebugLocString(const Loop *L) {
1161   std::string Result;
1162   if (L) {
1163     raw_string_ostream OS(Result);
1164     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1165       LoopDbgLoc.print(OS);
1166     else
1167       // Just print the module name.
1168       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1169     OS.flush();
1170   }
1171   return Result;
1172 }
1173 #endif
1174 
1175 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1176                                          const Instruction *Orig) {
1177   // If the loop was versioned with memchecks, add the corresponding no-alias
1178   // metadata.
1179   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1180     LVer->annotateInstWithNoAlias(To, Orig);
1181 }
1182 
1183 void InnerLoopVectorizer::addMetadata(Instruction *To,
1184                                       Instruction *From) {
1185   propagateMetadata(To, From);
1186   addNewMetadata(To, From);
1187 }
1188 
1189 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1190                                       Instruction *From) {
1191   for (Value *V : To) {
1192     if (Instruction *I = dyn_cast<Instruction>(V))
1193       addMetadata(I, From);
1194   }
1195 }
1196 
1197 namespace llvm {
1198 
// Hints for the loop vectorization cost model about how the scalar epilogue
// loop should be lowered.
1201 enum ScalarEpilogueLowering {
1202 
1203   // The default: allowing scalar epilogues.
1204   CM_ScalarEpilogueAllowed,
1205 
1206   // Vectorization with OptForSize: don't allow epilogues.
1207   CM_ScalarEpilogueNotAllowedOptSize,
1208 
  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, which makes
  // sure the cost of their loop body is dominant and free of runtime guards
  // and scalar iteration overheads.
1213   CM_ScalarEpilogueNotAllowedLowTripLoop,
1214 
1215   // Loop hint predicate indicating an epilogue is undesired.
1216   CM_ScalarEpilogueNotNeededUsePredicate,
1217 
  // Directive indicating we must either tail fold or not vectorize.
1219   CM_ScalarEpilogueNotAllowedUsePredicate
1220 };
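
// Rough illustration of the trade-off encoded above (not tied to any
// particular target): with a scalar epilogue and VF = 4, a loop of N
// iterations runs N / 4 vector iterations followed by N % 4 scalar
// iterations; with the predicate-based (tail-folding) modes, all iterations
// run in the masked vector loop and no scalar remainder loop is emitted.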
1221 
1222 /// LoopVectorizationCostModel - estimates the expected speedups due to
1223 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
1226 /// expected speedup/slowdowns due to the supported instruction set. We use the
1227 /// TargetTransformInfo to query the different backends for the cost of
1228 /// different operations.
1229 class LoopVectorizationCostModel {
1230 public:
1231   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1232                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1233                              LoopVectorizationLegality *Legal,
1234                              const TargetTransformInfo &TTI,
1235                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1236                              AssumptionCache *AC,
1237                              OptimizationRemarkEmitter *ORE, const Function *F,
1238                              const LoopVectorizeHints *Hints,
1239                              InterleavedAccessInfo &IAI)
1240       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1241         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1242         Hints(Hints), InterleaveInfo(IAI) {}
1243 
1244   /// \return An upper bound for the vectorization factor, or None if
1245   /// vectorization and interleaving should be avoided up front.
1246   Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1247 
1248   /// \return True if runtime checks are required for vectorization, and false
1249   /// otherwise.
1250   bool runtimeChecksRequired();
1251 
1252   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero,
  /// that vectorization factor will be selected if vectorization is possible.
1256   VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1257   VectorizationFactor
1258   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1259                                     const LoopVectorizationPlanner &LVP);
1260 
1261   /// Setup cost-based decisions for user vectorization factor.
1262   void selectUserVectorizationFactor(ElementCount UserVF) {
1263     collectUniformsAndScalars(UserVF);
1264     collectInstsToScalarize(UserVF);
1265   }
1266 
1267   /// \return The size (in bits) of the smallest and widest types in the code
1268   /// that needs to be vectorized. We ignore values that remain scalar such as
1269   /// 64 bit loop indices.
1270   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1271 
1272   /// \return The desired interleave count.
1273   /// If interleave count has been specified by metadata it will be returned.
1274   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1275   /// are the selected vectorization factor and the cost of the selected VF.
1276   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1277 
  /// A memory access instruction may be vectorized in more than one way, and
  /// the form it takes after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1285   void setCostBasedWideningDecision(ElementCount VF);
1286 
1287   /// A struct that represents some properties of the register usage
1288   /// of a loop.
1289   struct RegisterUsage {
1290     /// Holds the number of loop invariant values that are used in the loop.
1291     /// The key is ClassID of target-provided register class.
1292     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1293     /// Holds the maximum number of concurrent live intervals in the loop.
1294     /// The key is ClassID of target-provided register class.
1295     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1296   };
1297 
  /// \return Information about the register usage of the loop for the
  /// given vectorization factors.
1300   SmallVector<RegisterUsage, 8>
1301   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1302 
1303   /// Collect values we want to ignore in the cost model.
1304   void collectValuesToIgnore();
1305 
  /// Split reductions into those that happen in the loop and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
1308   void collectInLoopReductions();
1309 
1310   /// \returns The smallest bitwidth each instruction can be represented with.
1311   /// The vector equivalents of these instructions should be truncated to this
1312   /// type.
1313   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1314     return MinBWs;
1315   }
1316 
1317   /// \returns True if it is more profitable to scalarize instruction \p I for
1318   /// vectorization factor \p VF.
1319   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1320     assert(VF.isVector() &&
1321            "Profitable to scalarize relevant only for VF > 1.");
1322 
1323     // Cost model is not run in the VPlan-native path - return conservative
1324     // result until this changes.
1325     if (EnableVPlanNativePath)
1326       return false;
1327 
1328     auto Scalars = InstsToScalarize.find(VF);
1329     assert(Scalars != InstsToScalarize.end() &&
1330            "VF not yet analyzed for scalarization profitability");
1331     return Scalars->second.find(I) != Scalars->second.end();
1332   }
1333 
1334   /// Returns true if \p I is known to be uniform after vectorization.
1335   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1336     if (VF.isScalar())
1337       return true;
1338 
1339     // Cost model is not run in the VPlan-native path - return conservative
1340     // result until this changes.
1341     if (EnableVPlanNativePath)
1342       return false;
1343 
1344     auto UniformsPerVF = Uniforms.find(VF);
1345     assert(UniformsPerVF != Uniforms.end() &&
1346            "VF not yet analyzed for uniformity");
1347     return UniformsPerVF->second.count(I);
1348   }
1349 
1350   /// Returns true if \p I is known to be scalar after vectorization.
1351   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1352     if (VF.isScalar())
1353       return true;
1354 
1355     // Cost model is not run in the VPlan-native path - return conservative
1356     // result until this changes.
1357     if (EnableVPlanNativePath)
1358       return false;
1359 
1360     auto ScalarsPerVF = Scalars.find(VF);
1361     assert(ScalarsPerVF != Scalars.end() &&
1362            "Scalar values are not calculated for VF");
1363     return ScalarsPerVF->second.count(I);
1364   }
1365 
1366   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1367   /// for vectorization factor \p VF.
1368   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1369     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1370            !isProfitableToScalarize(I, VF) &&
1371            !isScalarAfterVectorization(I, VF);
1372   }
1373 
1374   /// Decision that was taken during cost calculation for memory instruction.
1375   enum InstWidening {
1376     CM_Unknown,
1377     CM_Widen,         // For consecutive accesses with stride +1.
1378     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1379     CM_Interleave,
1380     CM_GatherScatter,
1381     CM_Scalarize
1382   };
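
  // Rough mapping of common access patterns to the decisions above; the
  // final choice is always cost-driven and target-dependent. A load a[i]
  // with unit stride typically maps to CM_Widen, a[N - i] to
  // CM_Widen_Reverse, members of an interleave group to CM_Interleave, and
  // an indexed access a[b[i]] to CM_GatherScatter or CM_Scalarize.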
1383 
1384   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1385   /// instruction \p I and vector width \p VF.
1386   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1387                            unsigned Cost) {
1388     assert(VF.isVector() && "Expected VF >=2");
1389     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1390   }
1391 
1392   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1393   /// interleaving group \p Grp and vector width \p VF.
1394   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1395                            ElementCount VF, InstWidening W, unsigned Cost) {
1396     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group, but the
    // cost will be assigned to one instruction only.
1399     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1400       if (auto *I = Grp->getMember(i)) {
1401         if (Grp->getInsertPos() == I)
1402           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1403         else
1404           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1405       }
1406     }
1407   }
1408 
1409   /// Return the cost model decision for the given instruction \p I and vector
1410   /// width \p VF. Return CM_Unknown if this instruction did not pass
1411   /// through the cost modeling.
1412   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1413     assert(VF.isVector() && "Expected VF to be a vector VF");
1414     // Cost model is not run in the VPlan-native path - return conservative
1415     // result until this changes.
1416     if (EnableVPlanNativePath)
1417       return CM_GatherScatter;
1418 
1419     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1420     auto Itr = WideningDecisions.find(InstOnVF);
1421     if (Itr == WideningDecisions.end())
1422       return CM_Unknown;
1423     return Itr->second.first;
1424   }
1425 
1426   /// Return the vectorization cost for the given instruction \p I and vector
1427   /// width \p VF.
1428   unsigned getWideningCost(Instruction *I, ElementCount VF) {
1429     assert(VF.isVector() && "Expected VF >=2");
1430     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1431     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1432            "The cost is not calculated");
1433     return WideningDecisions[InstOnVF].second;
1434   }
1435 
1436   /// Return True if instruction \p I is an optimizable truncate whose operand
1437   /// is an induction variable. Such a truncate will be removed by adding a new
1438   /// induction variable with the destination type.
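  /// For illustration (assumed IR, not taken from an actual test):
  /// \code
  ///   %iv = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
  ///   %t  = trunc i64 %iv to i32
  /// \endcode
  /// Here %t can instead be produced by a new i32 induction variable, making
  /// the truncate itself unnecessary.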
1439   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1440     // If the instruction is not a truncate, return false.
1441     auto *Trunc = dyn_cast<TruncInst>(I);
1442     if (!Trunc)
1443       return false;
1444 
1445     // Get the source and destination types of the truncate.
1446     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1447     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1448 
1449     // If the truncate is free for the given types, return false. Replacing a
1450     // free truncate with an induction variable would add an induction variable
1451     // update instruction to each iteration of the loop. We exclude from this
1452     // check the primary induction variable since it will need an update
1453     // instruction regardless.
1454     Value *Op = Trunc->getOperand(0);
1455     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1456       return false;
1457 
1458     // If the truncated value is not an induction variable, return false.
1459     return Legal->isInductionPhi(Op);
1460   }
1461 
1462   /// Collects the instructions to scalarize for each predicated instruction in
1463   /// the loop.
1464   void collectInstsToScalarize(ElementCount VF);
1465 
1466   /// Collect Uniform and Scalar values for the given \p VF.
1467   /// The sets depend on CM decision for Load/Store instructions
1468   /// that may be vectorized as interleave, gather-scatter or scalarized.
1469   void collectUniformsAndScalars(ElementCount VF) {
1470     // Do the analysis once.
1471     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1472       return;
1473     setCostBasedWideningDecision(VF);
1474     collectLoopUniforms(VF);
1475     collectLoopScalars(VF);
1476   }
1477 
1478   /// Returns true if the target machine supports masked store operation
1479   /// for the given \p DataType and kind of access to \p Ptr.
1480   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1481     return Legal->isConsecutivePtr(Ptr) &&
1482            TTI.isLegalMaskedStore(DataType, Alignment);
1483   }
1484 
1485   /// Returns true if the target machine supports masked load operation
1486   /// for the given \p DataType and kind of access to \p Ptr.
1487   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1488     return Legal->isConsecutivePtr(Ptr) &&
1489            TTI.isLegalMaskedLoad(DataType, Alignment);
1490   }
1491 
1492   /// Returns true if the target machine supports masked scatter operation
1493   /// for the given \p DataType.
1494   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1495     return TTI.isLegalMaskedScatter(DataType, Alignment);
1496   }
1497 
1498   /// Returns true if the target machine supports masked gather operation
1499   /// for the given \p DataType.
1500   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1501     return TTI.isLegalMaskedGather(DataType, Alignment);
1502   }
1503 
1504   /// Returns true if the target machine can represent \p V as a masked gather
1505   /// or scatter operation.
1506   bool isLegalGatherOrScatter(Value *V) {
1507     bool LI = isa<LoadInst>(V);
1508     bool SI = isa<StoreInst>(V);
1509     if (!LI && !SI)
1510       return false;
1511     auto *Ty = getMemInstValueType(V);
1512     Align Align = getLoadStoreAlignment(V);
1513     return (LI && isLegalMaskedGather(Ty, Align)) ||
1514            (SI && isLegalMaskedScatter(Ty, Align));
1515   }
1516 
1517   /// Returns true if \p I is an instruction that will be scalarized with
1518   /// predication. Such instructions include conditional stores and
1519   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1522   bool isScalarWithPredication(Instruction *I,
1523                                ElementCount VF = ElementCount::getFixed(1));
1524 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// Superset of instructions that return true for isScalarWithPredication.
1528   bool isPredicatedInst(Instruction *I) {
1529     if (!blockNeedsPredication(I->getParent()))
1530       return false;
1531     // Loads and stores that need some form of masked operation are predicated
1532     // instructions.
1533     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1534       return Legal->isMaskRequired(I);
1535     return isScalarWithPredication(I);
1536   }
1537 
1538   /// Returns true if \p I is a memory instruction with consecutive memory
1539   /// access that can be widened.
1540   bool
1541   memoryInstructionCanBeWidened(Instruction *I,
1542                                 ElementCount VF = ElementCount::getFixed(1));
1543 
1544   /// Returns true if \p I is a memory instruction in an interleaved-group
1545   /// of memory accesses that can be vectorized with wide vector loads/stores
1546   /// and shuffles.
1547   bool
1548   interleavedAccessCanBeWidened(Instruction *I,
1549                                 ElementCount VF = ElementCount::getFixed(1));
1550 
1551   /// Check if \p Instr belongs to any interleaved access group.
1552   bool isAccessInterleaved(Instruction *Instr) {
1553     return InterleaveInfo.isInterleaved(Instr);
1554   }
1555 
1556   /// Get the interleaved access group that \p Instr belongs to.
1557   const InterleaveGroup<Instruction> *
1558   getInterleavedAccessGroup(Instruction *Instr) {
1559     return InterleaveInfo.getInterleaveGroup(Instr);
1560   }
1561 
1562   /// Returns true if we're required to use a scalar epilogue for at least
1563   /// the final iteration of the original loop.
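  /// For example, an interleave group with gaps must not read past the
  /// elements the original scalar loop accesses, and a loop that may exit
  /// from a block other than the latch must execute its exiting iteration in
  /// scalar form.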
1564   bool requiresScalarEpilogue() const {
1565     if (!isScalarEpilogueAllowed())
1566       return false;
1567     // If we might exit from anywhere but the latch, must run the exiting
1568     // iteration in scalar form.
1569     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1570       return true;
1571     return InterleaveInfo.requiresScalarEpilogue();
1572   }
1573 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
1576   bool isScalarEpilogueAllowed() const {
1577     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1578   }
1579 
  /// Returns true if all loop blocks should be masked in order to fold the
  /// loop tail.
1581   bool foldTailByMasking() const { return FoldTailByMasking; }
1582 
1583   bool blockNeedsPredication(BasicBlock *BB) {
1584     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1585   }
1586 
1587   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1588   /// nodes to the chain of instructions representing the reductions. Uses a
1589   /// MapVector to ensure deterministic iteration order.
1590   using ReductionChainMap =
1591       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1592 
1593   /// Return the chain of instructions representing an inloop reduction.
1594   const ReductionChainMap &getInLoopReductionChains() const {
1595     return InLoopReductionChains;
1596   }
1597 
1598   /// Returns true if the Phi is part of an inloop reduction.
1599   bool isInLoopReduction(PHINode *Phi) const {
1600     return InLoopReductionChains.count(Phi);
1601   }
1602 
1603   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1604   /// with factor VF.  Return the cost of the instruction, including
1605   /// scalarization overhead if it's needed.
1606   unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1607 
1608   /// Estimate cost of a call instruction CI if it were vectorized with factor
1609   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1613   unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1614                              bool &NeedToScalarize);
1615 
1616   /// Invalidates decisions already taken by the cost model.
1617   void invalidateCostModelingDecisions() {
1618     WideningDecisions.clear();
1619     Uniforms.clear();
1620     Scalars.clear();
1621   }
1622 
1623 private:
1624   unsigned NumPredStores = 0;
1625 
1626   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1627   /// than zero. One is returned if vectorization should best be avoided due
1628   /// to cost.
1629   ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1630                                     ElementCount UserVF);
1631 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1639   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1640 
1641   /// Returns the expected execution cost. The unit of the cost does
1642   /// not matter because we use the 'cost' units to compare different
1643   /// vector widths. The cost that is returned is *not* normalized by
1644   /// the factor width.
1645   VectorizationCostTy expectedCost(ElementCount VF);
1646 
1647   /// Returns the execution time cost of an instruction for a given vector
1648   /// width. Vector width of one means scalar.
1649   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1650 
1651   /// The cost-computation logic from getInstructionCost which provides
1652   /// the vector type as an output parameter.
1653   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1654                                      Type *&VectorTy);
1655 
1656   /// Calculate vectorization cost of memory instruction \p I.
1657   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1658 
1659   /// The cost computation for scalarized memory instruction.
1660   unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1661 
1662   /// The cost computation for interleaving group of memory instructions.
1663   unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1664 
1665   /// The cost computation for Gather/Scatter instruction.
1666   unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1667 
1668   /// The cost computation for widening instruction \p I with consecutive
1669   /// memory access.
1670   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1671 
1672   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1673   /// Load: scalar load + broadcast.
1674   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1675   /// element)
1676   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1677 
1678   /// Estimate the overhead of scalarizing an instruction. This is a
1679   /// convenience wrapper for the type-based getScalarizationOverhead API.
1680   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1681 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1684   bool isConsecutiveLoadOrStore(Instruction *I);
1685 
1686   /// Returns true if an artificially high cost for emulated masked memrefs
1687   /// should be used.
1688   bool useEmulatedMaskMemRefHack(Instruction *I);
1689 
1690   /// Map of scalar integer values to the smallest bitwidth they can be legally
1691   /// represented as. The vector equivalents of these values should be truncated
1692   /// to this type.
1693   MapVector<Instruction *, uint64_t> MinBWs;
1694 
1695   /// A type representing the costs for instructions if they were to be
1696   /// scalarized rather than vectorized. The entries are Instruction-Cost
1697   /// pairs.
1698   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1699 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1702   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1703 
1704   /// Records whether it is allowed to have the original scalar loop execute at
1705   /// least once. This may be needed as a fallback loop in case runtime
1706   /// aliasing/dependence checks fail, or to handle the tail/remainder
1707   /// iterations when the trip count is unknown or doesn't divide by the VF,
1708   /// or as a peel-loop to handle gaps in interleave-groups.
1709   /// Under optsize and when the trip count is very small we don't allow any
1710   /// iterations to execute in the scalar loop.
1711   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1712 
  /// All blocks of the loop are to be masked in order to fold the tail of the
  /// scalar iterations.
1714   bool FoldTailByMasking = false;
1715 
1716   /// A map holding scalar costs for different vectorization factors. The
1717   /// presence of a cost for an instruction in the mapping indicates that the
1718   /// instruction will be scalarized when vectorizing with the associated
1719   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1720   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1721 
1722   /// Holds the instructions known to be uniform after vectorization.
1723   /// The data is collected per VF.
1724   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1725 
1726   /// Holds the instructions known to be scalar after vectorization.
1727   /// The data is collected per VF.
1728   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1729 
1730   /// Holds the instructions (address computations) that are forced to be
1731   /// scalarized.
1732   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1733 
1734   /// PHINodes of the reductions that should be expanded in-loop along with
1735   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1737   ReductionChainMap InLoopReductionChains;
1738 
1739   /// Returns the expected difference in cost from scalarizing the expression
1740   /// feeding a predicated instruction \p PredInst. The instructions to
1741   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1742   /// non-negative return value implies the expression will be scalarized.
1743   /// Currently, only single-use chains are considered for scalarization.
1744   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1745                               ElementCount VF);
1746 
1747   /// Collect the instructions that are uniform after vectorization. An
1748   /// instruction is uniform if we represent it with a single scalar value in
1749   /// the vectorized loop corresponding to each vector iteration. Examples of
1750   /// uniform instructions include pointer operands of consecutive or
1751   /// interleaved memory accesses. Note that although uniformity implies an
1752   /// instruction will be scalar, the reverse is not true. In general, a
1753   /// scalarized instruction will be represented by VF scalar values in the
1754   /// vectorized loop, each corresponding to an iteration of the original
1755   /// scalar loop.
1756   void collectLoopUniforms(ElementCount VF);
1757 
1758   /// Collect the instructions that are scalar after vectorization. An
1759   /// instruction is scalar if it is known to be uniform or will be scalarized
1760   /// during vectorization. Non-uniform scalarized instructions will be
1761   /// represented by VF values in the vectorized loop, each corresponding to an
1762   /// iteration of the original scalar loop.
1763   void collectLoopScalars(ElementCount VF);
1764 
1765   /// Keeps cost model vectorization decision and cost for instructions.
1766   /// Right now it is used for memory instructions only.
1767   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1768                                 std::pair<InstWidening, unsigned>>;
1769 
1770   DecisionList WideningDecisions;
1771 
1772   /// Returns true if \p V is expected to be vectorized and it needs to be
1773   /// extracted.
1774   bool needsExtract(Value *V, ElementCount VF) const {
1775     Instruction *I = dyn_cast<Instruction>(V);
1776     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1777         TheLoop->isLoopInvariant(I))
1778       return false;
1779 
1780     // Assume we can vectorize V (and hence we need extraction) if the
1781     // scalars are not computed yet. This can happen, because it is called
1782     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1783     // the scalars are collected. That should be a safe assumption in most
1784     // cases, because we check if the operands have vectorizable types
1785     // beforehand in LoopVectorizationLegality.
1786     return Scalars.find(VF) == Scalars.end() ||
1787            !isScalarAfterVectorization(I, VF);
  }
1789 
1790   /// Returns a range containing only operands needing to be extracted.
1791   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1792                                                    ElementCount VF) {
1793     return SmallVector<Value *, 4>(make_filter_range(
1794         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1795   }
1796 
1797   /// Determines if we have the infrastructure to vectorize loop \p L and its
1798   /// epilogue, assuming the main loop is vectorized by \p VF.
1799   bool isCandidateForEpilogueVectorization(const Loop &L,
1800                                            const ElementCount VF) const;
1801 
1802   /// Returns true if epilogue vectorization is considered profitable, and
1803   /// false otherwise.
1804   /// \p VF is the vectorization factor chosen for the original loop.
1805   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1806 
1807 public:
1808   /// The loop that we evaluate.
1809   Loop *TheLoop;
1810 
1811   /// Predicated scalar evolution analysis.
1812   PredicatedScalarEvolution &PSE;
1813 
1814   /// Loop Info analysis.
1815   LoopInfo *LI;
1816 
1817   /// Vectorization legality.
1818   LoopVectorizationLegality *Legal;
1819 
1820   /// Vector target information.
1821   const TargetTransformInfo &TTI;
1822 
1823   /// Target Library Info.
1824   const TargetLibraryInfo *TLI;
1825 
1826   /// Demanded bits analysis.
1827   DemandedBits *DB;
1828 
1829   /// Assumption cache.
1830   AssumptionCache *AC;
1831 
1832   /// Interface to emit optimization remarks.
1833   OptimizationRemarkEmitter *ORE;
1834 
1835   const Function *TheFunction;
1836 
1837   /// Loop Vectorize Hint.
1838   const LoopVectorizeHints *Hints;
1839 
1840   /// The interleave access information contains groups of interleaved accesses
1841   /// with the same stride and close to each other.
1842   InterleavedAccessInfo &InterleaveInfo;
1843 
1844   /// Values to ignore in the cost model.
1845   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1846 
1847   /// Values to ignore in the cost model when VF > 1.
1848   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1849 
1850   /// Profitable vector factors.
1851   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1852 };
1853 
1854 } // end namespace llvm
1855 
1856 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1857 // vectorization. The loop needs to be annotated with #pragma omp simd
1858 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1859 // vector length information is not provided, vectorization is not considered
1860 // explicit. Interleave hints are not allowed either. These limitations will be
1861 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1863 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1864 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1865 // provides *explicit vectorization hints* (LV can bypass legal checks and
1866 // assume that vectorization is legal). However, both hints are implemented
1867 // using the same metadata (llvm.loop.vectorize, processed by
1868 // LoopVectorizeHints). This will be fixed in the future when the native IR
1869 // representation for pragma 'omp simd' is introduced.
1870 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1871                                    OptimizationRemarkEmitter *ORE) {
1872   assert(!OuterLp->isInnermost() && "This is not an outer loop");
1873   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1874 
1875   // Only outer loops with an explicit vectorization hint are supported.
1876   // Unannotated outer loops are ignored.
1877   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1878     return false;
1879 
1880   Function *Fn = OuterLp->getHeader()->getParent();
1881   if (!Hints.allowVectorization(Fn, OuterLp,
1882                                 true /*VectorizeOnlyWhenForced*/)) {
1883     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1884     return false;
1885   }
1886 
1887   if (Hints.getInterleave() > 1) {
1888     // TODO: Interleave support is future work.
1889     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1890                          "outer loops.\n");
1891     Hints.emitRemarkWithHints();
1892     return false;
1893   }
1894 
1895   return true;
1896 }
1897 
1898 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1899                                   OptimizationRemarkEmitter *ORE,
1900                                   SmallVectorImpl<Loop *> &V) {
1901   // Collect inner loops and outer loops without irreducible control flow. For
1902   // now, only collect outer loops that have explicit vectorization hints. If we
1903   // are stress testing the VPlan H-CFG construction, we collect the outermost
1904   // loop of every loop nest.
1905   if (L.isInnermost() || VPlanBuildStressTest ||
1906       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1907     LoopBlocksRPO RPOT(&L);
1908     RPOT.perform(LI);
1909     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1910       V.push_back(&L);
1911       // TODO: Collect inner loops inside marked outer loops in case
1912       // vectorization fails for the outer loop. Do not invoke
1913       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1914       // already known to be reducible. We can use an inherited attribute for
1915       // that.
1916       return;
1917     }
1918   }
1919   for (Loop *InnerL : L)
1920     collectSupportedLoops(*InnerL, LI, ORE, V);
1921 }
1922 
1923 namespace {
1924 
1925 /// The LoopVectorize Pass.
1926 struct LoopVectorize : public FunctionPass {
1927   /// Pass identification, replacement for typeid
1928   static char ID;
1929 
1930   LoopVectorizePass Impl;
1931 
1932   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1933                          bool VectorizeOnlyWhenForced = false)
1934       : FunctionPass(ID),
1935         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1936     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1937   }
1938 
1939   bool runOnFunction(Function &F) override {
1940     if (skipFunction(F))
1941       return false;
1942 
1943     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1944     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1945     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1946     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1947     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1948     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1949     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1950     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1951     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1952     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1953     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1954     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1955     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1956 
1957     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1958         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1959 
1960     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1961                         GetLAA, *ORE, PSI).MadeAnyChange;
1962   }
1963 
1964   void getAnalysisUsage(AnalysisUsage &AU) const override {
1965     AU.addRequired<AssumptionCacheTracker>();
1966     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1967     AU.addRequired<DominatorTreeWrapperPass>();
1968     AU.addRequired<LoopInfoWrapperPass>();
1969     AU.addRequired<ScalarEvolutionWrapperPass>();
1970     AU.addRequired<TargetTransformInfoWrapperPass>();
1971     AU.addRequired<AAResultsWrapperPass>();
1972     AU.addRequired<LoopAccessLegacyAnalysis>();
1973     AU.addRequired<DemandedBitsWrapperPass>();
1974     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1975     AU.addRequired<InjectTLIMappingsLegacy>();
1976 
1977     // We currently do not preserve loopinfo/dominator analyses with outer loop
1978     // vectorization. Until this is addressed, mark these analyses as preserved
1979     // only for non-VPlan-native path.
1980     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1981     if (!EnableVPlanNativePath) {
1982       AU.addPreserved<LoopInfoWrapperPass>();
1983       AU.addPreserved<DominatorTreeWrapperPass>();
1984     }
1985 
1986     AU.addPreserved<BasicAAWrapperPass>();
1987     AU.addPreserved<GlobalsAAWrapperPass>();
1988     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1989   }
1990 };
1991 
1992 } // end anonymous namespace
1993 
1994 //===----------------------------------------------------------------------===//
1995 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1996 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1997 //===----------------------------------------------------------------------===//
1998 
1999 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
2003   Instruction *Instr = dyn_cast<Instruction>(V);
2004   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2005                      (!Instr ||
2006                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2007   // Place the code for broadcasting invariant variables in the new preheader.
2008   IRBuilder<>::InsertPointGuard Guard(Builder);
2009   if (SafeToHoist)
2010     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2011 
2012   // Broadcast the scalar into all locations in the vector.
2013   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2014 
2015   return Shuf;
2016 }
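// For illustration, with VF = 4 the splat created above for a scalar %x is
// the vector <%x, %x, %x, %x>, which IRBuilder typically materializes as an
// insertelement followed by a shufflevector.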
2017 
2018 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2019     const InductionDescriptor &II, Value *Step, Value *Start,
2020     Instruction *EntryVal) {
2021   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2022          "Expected either an induction phi-node or a truncate of it!");
2023 
2024   // Construct the initial value of the vector IV in the vector loop preheader
2025   auto CurrIP = Builder.saveIP();
2026   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2027   if (isa<TruncInst>(EntryVal)) {
2028     assert(Start->getType()->isIntegerTy() &&
2029            "Truncation requires an integer type");
2030     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2031     Step = Builder.CreateTrunc(Step, TruncType);
2032     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2033   }
2034   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2035   Value *SteppedStart =
2036       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2037 
2038   // We create vector phi nodes for both integer and floating-point induction
2039   // variables. Here, we determine the kind of arithmetic we will perform.
2040   Instruction::BinaryOps AddOp;
2041   Instruction::BinaryOps MulOp;
2042   if (Step->getType()->isIntegerTy()) {
2043     AddOp = Instruction::Add;
2044     MulOp = Instruction::Mul;
2045   } else {
2046     AddOp = II.getInductionOpcode();
2047     MulOp = Instruction::FMul;
2048   }
2049 
2050   // Multiply the vectorization factor by the step using integer or
2051   // floating-point arithmetic as appropriate.
2052   Value *ConstVF =
2053       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
2054   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
2055 
2056   // Create a vector splat to use in the induction update.
2057   //
2058   // FIXME: If the step is non-constant, we create the vector splat with
2059   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2060   //        handle a constant vector splat.
2061   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2062   Value *SplatVF = isa<Constant>(Mul)
2063                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2064                        : Builder.CreateVectorSplat(VF, Mul);
2065   Builder.restoreIP(CurrIP);
2066 
2067   // We may need to add the step a number of times, depending on the unroll
2068   // factor. The last of those goes into the PHI.
2069   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2070                                     &*LoopVectorBody->getFirstInsertionPt());
2071   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2072   Instruction *LastInduction = VecInd;
2073   for (unsigned Part = 0; Part < UF; ++Part) {
2074     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
2075 
2076     if (isa<TruncInst>(EntryVal))
2077       addMetadata(LastInduction, EntryVal);
2078     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
2079 
2080     LastInduction = cast<Instruction>(addFastMathFlag(
2081         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
2082     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2083   }
2084 
2085   // Move the last step to the end of the latch block. This ensures consistent
2086   // placement of all induction updates.
2087   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2088   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2089   auto *ICmp = cast<Instruction>(Br->getCondition());
2090   LastInduction->moveBefore(ICmp);
2091   LastInduction->setName("vec.ind.next");
2092 
2093   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2094   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2095 }
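// Illustrative shape of the result above for an i32 IV with Start = 0,
// Step = 1, VF = 4 and UF = 2 (assumed values):
//   vec.ind      = phi <4 x i32> [ <0, 1, 2, 3>, %vector.ph ],
//                                [ %vec.ind.next, %latch ]
//   step.add     = vec.ind + <4, 4, 4, 4>    ; used by unroll part 1
//   vec.ind.next = step.add + <4, 4, 4, 4>   ; placed in the latch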
2096 
2097 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2098   return Cost->isScalarAfterVectorization(I, VF) ||
2099          Cost->isProfitableToScalarize(I, VF);
2100 }
2101 
2102 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2103   if (shouldScalarizeInstruction(IV))
2104     return true;
2105   auto isScalarInst = [&](User *U) -> bool {
2106     auto *I = cast<Instruction>(U);
2107     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2108   };
2109   return llvm::any_of(IV->users(), isScalarInst);
2110 }
2111 
2112 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2113     const InductionDescriptor &ID, const Instruction *EntryVal,
2114     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
2115   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2116          "Expected either an induction phi-node or a truncate of it!");
2117 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV, based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
2124   if (isa<TruncInst>(EntryVal))
2125     return;
2126 
2127   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2128   if (Casts.empty())
2129     return;
2130   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
2132   // induction update chain itself.
2133   Instruction *CastInst = *Casts.begin();
2134   if (Lane < UINT_MAX)
2135     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
2136   else
2137     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
2138 }
2139 
2140 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
2141                                                 TruncInst *Trunc) {
2142   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2143          "Primary induction variable must have an integer type");
2144 
2145   auto II = Legal->getInductionVars().find(IV);
2146   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2147 
2148   auto ID = II->second;
2149   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2150 
2151   // The value from the original loop to which we are mapping the new induction
2152   // variable.
2153   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2154 
2155   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2156 
2157   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
2159   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2160     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2161            "Induction step should be loop invariant");
2162     if (PSE.getSE()->isSCEVable(IV->getType())) {
2163       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2164       return Exp.expandCodeFor(Step, Step->getType(),
2165                                LoopVectorPreHeader->getTerminator());
2166     }
2167     return cast<SCEVUnknown>(Step)->getValue();
2168   };
2169 
2170   // The scalar value to broadcast. This is derived from the canonical
2171   // induction variable. If a truncation type is given, truncate the canonical
2172   // induction variable and step. Otherwise, derive these values from the
2173   // induction descriptor.
2174   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2175     Value *ScalarIV = Induction;
2176     if (IV != OldInduction) {
2177       ScalarIV = IV->getType()->isIntegerTy()
2178                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2179                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2180                                           IV->getType());
2181       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2182       ScalarIV->setName("offset.idx");
2183     }
2184     if (Trunc) {
2185       auto *TruncType = cast<IntegerType>(Trunc->getType());
2186       assert(Step->getType()->isIntegerTy() &&
2187              "Truncation requires an integer step");
2188       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2189       Step = Builder.CreateTrunc(Step, TruncType);
2190     }
2191     return ScalarIV;
2192   };
2193 
2194   // Create the vector values from the scalar IV, in the absence of creating a
2195   // vector IV.
2196   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2197     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2198     for (unsigned Part = 0; Part < UF; ++Part) {
2199       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2200       Value *EntryPart =
2201           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2202                         ID.getInductionOpcode());
2203       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2204       if (Trunc)
2205         addMetadata(EntryPart, Trunc);
2206       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
2207     }
2208   };
2209 
2210   // Now do the actual transformations, and start with creating the step value.
2211   Value *Step = CreateStepValue(ID.getStep());
2212   if (VF.isZero() || VF.isScalar()) {
2213     Value *ScalarIV = CreateScalarIV(Step);
2214     CreateSplatIV(ScalarIV, Step);
2215     return;
2216   }
2217 
2218   // Determine if we want a scalar version of the induction variable. This is
2219   // true if the induction variable itself is not widened, or if it has at
2220   // least one user in the loop that is not widened.
2221   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2222   if (!NeedsScalarIV) {
2223     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal);
2224     return;
2225   }
2226 
2227   // Try to create a new independent vector induction variable. If we can't
2228   // create the phi node, we will splat the scalar induction variable in each
2229   // loop iteration.
2230   if (!shouldScalarizeInstruction(EntryVal)) {
2231     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal);
2232     Value *ScalarIV = CreateScalarIV(Step);
2233     // Create scalar steps that can be used by instructions we will later
2234     // scalarize. Note that the addition of the scalar steps will not increase
2235     // the number of instructions in the loop in the common case prior to
2236     // InstCombine. We will be trading one vector extract for each scalar step.
2237     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2238     return;
2239   }
2240 
  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV, except when we tail-fold: then the splat IV feeds the
  // predicate used by the masked loads/stores.
2244   Value *ScalarIV = CreateScalarIV(Step);
2245   if (!Cost->isScalarEpilogueAllowed())
2246     CreateSplatIV(ScalarIV, Step);
2247   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2248 }
2249 
2250 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2251                                           Instruction::BinaryOps BinOp) {
2252   // Create and check the types.
2253   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2254   int VLen = ValVTy->getNumElements();
2255 
2256   Type *STy = Val->getType()->getScalarType();
2257   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2258          "Induction Step must be an integer or FP");
2259   assert(Step->getType() == STy && "Step has wrong type");
2260 
2261   SmallVector<Constant *, 8> Indices;
2262 
2263   if (STy->isIntegerTy()) {
    // Create a vector of VLen consecutive numbers starting at StartIdx.
2265     for (int i = 0; i < VLen; ++i)
2266       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2267 
2268     // Add the consecutive indices to the vector value.
2269     Constant *Cv = ConstantVector::get(Indices);
2270     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2271     Step = Builder.CreateVectorSplat(VLen, Step);
2272     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be taken from the original scalar operations.
2275     Step = Builder.CreateMul(Cv, Step);
2276     return Builder.CreateAdd(Val, Step, "induction");
2277   }
2278 
2279   // Floating point induction.
2280   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2281          "Binary Opcode should be specified for FP induction");
  // Create a vector of VLen consecutive numbers starting at StartIdx.
2283   for (int i = 0; i < VLen; ++i)
2284     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2285 
2286   // Add the consecutive indices to the vector value.
2287   Constant *Cv = ConstantVector::get(Indices);
2288 
2289   Step = Builder.CreateVectorSplat(VLen, Step);
2290 
2291   // Floating point operations had to be 'fast' to enable the induction.
2292   FastMathFlags Flags;
2293   Flags.setFast();
2294 
2295   Value *MulOp = Builder.CreateFMul(Cv, Step);
2296   if (isa<Instruction>(MulOp))
    // Have to check: MulOp may be a constant.
2298     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2299 
2300   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2301   if (isa<Instruction>(BOp))
2302     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2303   return BOp;
2304 }
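// For illustration, with Val = <%s, %s, %s, %s>, StartIdx = 0 and an integer
// Step %d, the result above is <%s, %s + %d, %s + 2 * %d, %s + 3 * %d>.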
2305 
2306 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2307                                            Instruction *EntryVal,
2308                                            const InductionDescriptor &ID) {
2309   // We shouldn't have to build scalar steps if we aren't vectorizing.
2310   assert(VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2312   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2313   assert(ScalarIVTy == Step->getType() &&
2314          "Val and Step should have the same type");
2315 
2316   // We build scalar steps for both integer and floating-point induction
2317   // variables. Here, we determine the kind of arithmetic we will perform.
2318   Instruction::BinaryOps AddOp;
2319   Instruction::BinaryOps MulOp;
2320   if (ScalarIVTy->isIntegerTy()) {
2321     AddOp = Instruction::Add;
2322     MulOp = Instruction::Mul;
2323   } else {
2324     AddOp = ID.getInductionOpcode();
2325     MulOp = Instruction::FMul;
2326   }
2327 
2328   // Determine the number of scalars we need to generate for each unroll
2329   // iteration. If EntryVal is uniform, we only need to generate the first
2330   // lane. Otherwise, we generate all VF values.
2331   unsigned Lanes =
2332       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2333           ? 1
2334           : VF.getKnownMinValue();
2335   assert((!VF.isScalable() || Lanes == 1) &&
2336          "Should never scalarize a scalable vector");
2337   // Compute the scalar steps and save the results in VectorLoopValueMap.
2338   for (unsigned Part = 0; Part < UF; ++Part) {
2339     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2340       auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2341                                          ScalarIVTy->getScalarSizeInBits());
2342       Value *StartIdx =
2343           createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2344       if (ScalarIVTy->isFloatingPointTy())
2345         StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
2346       StartIdx = addFastMathFlag(Builder.CreateBinOp(
2347           AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
2348       // The step returned by `createStepForVF` is a runtime-evaluated value
2349       // when VF is scalable. Otherwise, it should be folded into a Constant.
2350       assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2351              "Expected StartIdx to be folded to a constant when VF is not "
2352              "scalable");
2353       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2354       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2355       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2356       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2357     }
2358   }
2359 }
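// For illustration, with an integer IV, a fixed VF = 4 and UF = 2 (assumed
// values), the loop above produces the scalar values
//   ScalarIV + (Part * 4 + Lane) * Step
// for Part in {0, 1} and Lane in {0, ..., 3} (Lane 0 only if EntryVal is
// uniform after vectorization).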
2360 
2361 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2362   assert(V != Induction && "The new induction variable should not be used.");
2363   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2364   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2365 
2366   // If we have a stride that is replaced by one, do it here. Defer this for
2367   // the VPlan-native path until we start running Legal checks in that path.
2368   if (!EnableVPlanNativePath && Legal->hasStride(V))
2369     V = ConstantInt::get(V->getType(), 1);
2370 
2371   // If we have a vector mapped to this value, return it.
2372   if (VectorLoopValueMap.hasVectorValue(V, Part))
2373     return VectorLoopValueMap.getVectorValue(V, Part);
2374 
2375   // If the value has not been vectorized, check if it has been scalarized
2376   // instead. If it has been scalarized, and we actually need the value in
2377   // vector form, we will construct the vector values on demand.
2378   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2379     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2380 
2381     // If we've scalarized a value, that value should be an instruction.
2382     auto *I = cast<Instruction>(V);
2383 
2384     // If we aren't vectorizing, we can just copy the scalar map values over to
2385     // the vector map.
2386     if (VF.isScalar()) {
2387       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2388       return ScalarValue;
2389     }
2390 
2391     // Get the last scalar instruction we generated for V and Part. If the value
2392     // is known to be uniform after vectorization, this corresponds to lane zero
2393     // of the Part unroll iteration. Otherwise, the last instruction is the one
2394     // we created for the last vector lane of the Part unroll iteration.
2395     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2396                             ? 0
2397                             : VF.getKnownMinValue() - 1;
2398     assert((!VF.isScalable() || LastLane == 0) &&
2399            "Scalable vectorization can't lead to any scalarized values.");
2400     auto *LastInst = cast<Instruction>(
2401         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2402 
2403     // Set the insert point after the last scalarized instruction. This ensures
2404     // the insertelement sequence will directly follow the scalar definitions.
2405     auto OldIP = Builder.saveIP();
2406     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2407     Builder.SetInsertPoint(&*NewIP);
2408 
2409     // However, if we are vectorizing, we need to construct the vector values.
2410     // If the value is known to be uniform after vectorization, we can just
2411     // broadcast the scalar value corresponding to lane zero for each unroll
2412     // iteration. Otherwise, we construct the vector values using insertelement
2413     // instructions. Since the resulting vectors are stored in
2414     // VectorLoopValueMap, we will only generate the insertelements once.
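    // For example (illustrative), a non-uniform scalarized value with a fixed
    // VF = 4 is packed by four insertelement instructions that fill lanes
    // 0..3 of a vector that starts out as poison.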
2415     Value *VectorValue = nullptr;
2416     if (Cost->isUniformAfterVectorization(I, VF)) {
2417       VectorValue = getBroadcastInstrs(ScalarValue);
2418       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2419     } else {
2420       // Initialize packing with insertelements to start from poison.
2421       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2422       Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF));
2423       VectorLoopValueMap.setVectorValue(V, Part, Poison);
2424       for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2425         packScalarIntoVectorValue(V, {Part, Lane});
2426       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2427     }
2428     Builder.restoreIP(OldIP);
2429     return VectorValue;
2430   }
2431 
2432   // If this scalar is unknown, assume that it is a constant or that it is
2433   // loop invariant. Broadcast V and save the value for future uses.
2434   Value *B = getBroadcastInstrs(V);
2435   VectorLoopValueMap.setVectorValue(V, Part, B);
2436   return B;
2437 }
2438 
2439 Value *
2440 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2441                                             const VPIteration &Instance) {
2442   // If the value is not an instruction contained in the loop, it should
2443   // already be scalar.
2444   if (OrigLoop->isLoopInvariant(V))
2445     return V;
2446 
2447   assert(Instance.Lane > 0
2448              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2449              : true && "Uniform values only have lane zero");
2450 
2451   // If the value from the original loop has not been vectorized, it is
2452   // represented by UF x VF scalar values in the new loop. Return the requested
2453   // scalar value.
2454   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2455     return VectorLoopValueMap.getScalarValue(V, Instance);
2456 
2457   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2458   // for the given unroll part. If this entry is not a vector type (i.e., the
2459   // vectorization factor is one), there is no need to generate an
2460   // extractelement instruction.
2461   auto *U = getOrCreateVectorValue(V, Instance.Part);
2462   if (!U->getType()->isVectorTy()) {
2463     assert(VF.isScalar() && "Value not scalarized has non-vector type");
2464     return U;
2465   }
2466 
2467   // Otherwise, the value from the original loop has been vectorized and is
2468   // represented by UF vector values. Extract and return the requested scalar
2469   // value from the appropriate vector lane.
2470   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2471 }
2472 
2473 void InnerLoopVectorizer::packScalarIntoVectorValue(
2474     Value *V, const VPIteration &Instance) {
2475   assert(V != Induction && "The new induction variable should not be used.");
2476   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2477   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2478 
2479   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2480   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2481   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2482                                             Builder.getInt32(Instance.Lane));
2483   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2484 }
2485 
2486 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2487   assert(Vec->getType()->isVectorTy() && "Invalid type");
2488   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
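  // For example (illustrative), with VF = 4 the loop below produces the
  // shuffle mask <3, 2, 1, 0>.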
2489   SmallVector<int, 8> ShuffleMask;
2490   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2491     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2492 
2493   return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2494 }
2495 
2496 // Return whether we allow using masked interleave-groups (for dealing with
2497 // strided loads/stores that reside in predicated blocks, or for dealing
2498 // with gaps).
2499 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2500   // If an override option has been passed in for interleaved accesses, use it.
2501   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2502     return EnableMaskedInterleavedMemAccesses;
2503 
2504   return TTI.enableMaskedInterleavedAccessVectorization();
2505 }
2506 
2507 // Try to vectorize the interleave group that \p Instr belongs to.
2508 //
// E.g. Translate the following interleaved load group (factor = 3):
2510 //   for (i = 0; i < N; i+=3) {
2511 //     R = Pic[i];             // Member of index 0
2512 //     G = Pic[i+1];           // Member of index 1
2513 //     B = Pic[i+2];           // Member of index 2
2514 //     ... // do something to R, G, B
2515 //   }
2516 // To:
2517 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2518 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2519 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2520 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2521 //
// Or translate the following interleaved store group (factor = 3):
2523 //   for (i = 0; i < N; i+=3) {
2524 //     ... do something to R, G, B
2525 //     Pic[i]   = R;           // Member of index 0
2526 //     Pic[i+1] = G;           // Member of index 1
2527 //     Pic[i+2] = B;           // Member of index 2
2528 //   }
2529 // To:
2530 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2531 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2532 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2533 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2534 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2535 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2536     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2537     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2538     VPValue *BlockInMask) {
2539   Instruction *Instr = Group->getInsertPos();
2540   const DataLayout &DL = Instr->getModule()->getDataLayout();
2541 
2542   // Prepare for the vector type of the interleaved load/store.
2543   Type *ScalarTy = getMemInstValueType(Instr);
2544   unsigned InterleaveFactor = Group->getFactor();
2545   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2546   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2547 
2548   // Prepare for the new pointers.
2549   SmallVector<Value *, 2> AddrParts;
2550   unsigned Index = Group->getIndex(Instr);
2551 
2552   // TODO: extend the masked interleaved-group support to reversed access.
2553   assert((!BlockInMask || !Group->isReverse()) &&
2554          "Reversed masked interleave-group not supported.");
2555 
2556   // If the group is reverse, adjust the index to refer to the last vector lane
2557   // instead of the first. We adjust the index from the first vector lane,
2558   // rather than directly getting the pointer for lane VF - 1, because the
2559   // pointer operand of the interleaved access is supposed to be uniform. For
2560   // uniform instructions, we're only required to generate a value for the
2561   // first vector lane in each unroll iteration.
2562   assert(!VF.isScalable() &&
2563          "scalable vector reverse operation is not implemented");
2564   if (Group->isReverse())
2565     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2566 
2567   for (unsigned Part = 0; Part < UF; Part++) {
2568     Value *AddrPart = State.get(Addr, {Part, 0});
2569     setDebugLocFromInst(Builder, AddrPart);
2570 
    // Note that the current instruction could be a member at any index of the
    // group, so we must adjust the address down to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2582 
2583     bool InBounds = false;
2584     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2585       InBounds = gep->isInBounds();
2586     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2587     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2588 
2589     // Cast to the vector pointer type.
2590     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2591     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2592     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2593   }
2594 
2595   setDebugLocFromInst(Builder, Instr);
2596   Value *PoisonVec = PoisonValue::get(VecTy);
2597 
2598   Value *MaskForGaps = nullptr;
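  // A gap mask disables the lanes that correspond to missing group members.
  // For example (illustrative), a group with factor 3 whose member at index 2
  // is absent and VF = 4 gets the mask <1,1,0, 1,1,0, 1,1,0, 1,1,0>.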
2599   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2600     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2601     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2602     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2603   }
2604 
2605   // Vectorize the interleaved load group.
2606   if (isa<LoadInst>(Instr)) {
2607     // For each unroll part, create a wide load for the group.
2608     SmallVector<Value *, 2> NewLoads;
2609     for (unsigned Part = 0; Part < UF; Part++) {
2610       Instruction *NewLoad;
2611       if (BlockInMask || MaskForGaps) {
2612         assert(useMaskedInterleavedAccesses(*TTI) &&
2613                "masked interleaved groups are not allowed.");
2614         Value *GroupMask = MaskForGaps;
2615         if (BlockInMask) {
2616           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2617           assert(!VF.isScalable() && "scalable vectors not yet supported.");
2618           Value *ShuffledMask = Builder.CreateShuffleVector(
2619               BlockInMaskPart,
2620               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2621               "interleaved.mask");
2622           GroupMask = MaskForGaps
2623                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2624                                                 MaskForGaps)
2625                           : ShuffledMask;
2626         }
2627         NewLoad =
2628             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2629                                      GroupMask, PoisonVec, "wide.masked.vec");
2630       }
2631       else
2632         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2633                                             Group->getAlign(), "wide.vec");
2634       Group->addMetadata(NewLoad);
2635       NewLoads.push_back(NewLoad);
2636     }
2637 
2638     // For each member in the group, shuffle out the appropriate data from the
2639     // wide loads.
2640     unsigned J = 0;
2641     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2642       Instruction *Member = Group->getMember(I);
2643 
2644       // Skip the gaps in the group.
2645       if (!Member)
2646         continue;
2647 
2648       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2649       auto StrideMask =
2650           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2651       for (unsigned Part = 0; Part < UF; Part++) {
2652         Value *StridedVec = Builder.CreateShuffleVector(
2653             NewLoads[Part], StrideMask, "strided.vec");
2654 
        // If this member has a different type, cast the result to that type.
2656         if (Member->getType() != ScalarTy) {
2657           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2658           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2659           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2660         }
2661 
2662         if (Group->isReverse())
2663           StridedVec = reverseVector(StridedVec);
2664 
2665         State.set(VPDefs[J], Member, StridedVec, Part);
2666       }
2667       ++J;
2668     }
2669     return;
2670   }
2671 
  // The subvector type for the current instruction.
2673   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2674   auto *SubVT = VectorType::get(ScalarTy, VF);
2675 
2676   // Vectorize the interleaved store group.
2677   for (unsigned Part = 0; Part < UF; Part++) {
2678     // Collect the stored vector from each member.
2679     SmallVector<Value *, 4> StoredVecs;
2680     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow a gap, so each index must
      // have a member.
      assert(Group->getMember(i) &&
             "Fail to get a member from an interleaved store group");
2683 
2684       Value *StoredVec = State.get(StoredValues[i], Part);
2685 
2686       if (Group->isReverse())
2687         StoredVec = reverseVector(StoredVec);
2688 
      // If this member has a different type, cast it to the unified type.
      if (StoredVec->getType() != SubVT)
2692         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2693 
2694       StoredVecs.push_back(StoredVec);
2695     }
2696 
2697     // Concatenate all vectors into a wide vector.
2698     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2699 
2700     // Interleave the elements in the wide vector.
2701     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2702     Value *IVec = Builder.CreateShuffleVector(
2703         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2704         "interleaved.vec");
2705 
2706     Instruction *NewStoreInstr;
2707     if (BlockInMask) {
2708       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2709       Value *ShuffledMask = Builder.CreateShuffleVector(
2710           BlockInMaskPart,
2711           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2712           "interleaved.mask");
2713       NewStoreInstr = Builder.CreateMaskedStore(
2714           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2715     }
2716     else
2717       NewStoreInstr =
2718           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2719 
2720     Group->addMetadata(NewStoreInstr);
2721   }
2722 }
2723 
2724 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2725     Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2726     VPValue *StoredValue, VPValue *BlockInMask) {
  // Attempt to issue a wide load or store.
2728   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2729   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2730 
2731   assert((LI || SI) && "Invalid Load/Store instruction");
2732   assert((!SI || StoredValue) && "No stored value provided for widened store");
2733   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2734 
2735   LoopVectorizationCostModel::InstWidening Decision =
2736       Cost->getWideningDecision(Instr, VF);
2737   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2738           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2739           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2740          "CM decision is not to widen the memory instruction");
2741 
2742   Type *ScalarDataTy = getMemInstValueType(Instr);
2743 
2744   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2745   const Align Alignment = getLoadStoreAlignment(Instr);
2746 
2747   // Determine if the pointer operand of the access is either consecutive or
2748   // reverse consecutive.
2749   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2750   bool ConsecutiveStride =
2751       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2752   bool CreateGatherScatter =
2753       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2754 
2755   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2756   // gather/scatter. Otherwise Decision should have been to Scalarize.
2757   assert((ConsecutiveStride || CreateGatherScatter) &&
2758          "The instruction should be scalarized");
2759   (void)ConsecutiveStride;
2760 
2761   VectorParts BlockInMaskParts(UF);
2762   bool isMaskRequired = BlockInMask;
2763   if (isMaskRequired)
2764     for (unsigned Part = 0; Part < UF; ++Part)
2765       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2766 
2767   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2768     // Calculate the pointer for the specific unroll-part.
2769     GetElementPtrInst *PartPtr = nullptr;
2770 
2771     bool InBounds = false;
2772     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2773       InBounds = gep->isInBounds();
2774 
2775     if (Reverse) {
2776       assert(!VF.isScalable() &&
2777              "Reversing vectors is not yet supported for scalable vectors.");
2778 
2779       // If the address is consecutive but reversed, then the
2780       // wide store needs to start at the last vector element.
2781       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2782           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2783       PartPtr->setIsInBounds(InBounds);
2784       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2785           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2786       PartPtr->setIsInBounds(InBounds);
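      // For example (illustrative), with VF = 4 the wide access for part 0
      // covers Ptr[-3..0] and the one for part 1 covers Ptr[-7..-4]; the
      // loaded or stored values are then reversed to restore the original
      // element order.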
2787       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2788         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2789     } else {
2790       Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2791       PartPtr = cast<GetElementPtrInst>(
2792           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2793       PartPtr->setIsInBounds(InBounds);
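      // E.g. (illustrative) with VF = 4, part 1 starts at Ptr + 4.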
2794     }
2795 
2796     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2797     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2798   };
2799 
2800   // Handle Stores:
2801   if (SI) {
2802     setDebugLocFromInst(Builder, SI);
2803 
2804     for (unsigned Part = 0; Part < UF; ++Part) {
2805       Instruction *NewSI = nullptr;
2806       Value *StoredVal = State.get(StoredValue, Part);
2807       if (CreateGatherScatter) {
2808         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2809         Value *VectorGep = State.get(Addr, Part);
2810         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2811                                             MaskPart);
2812       } else {
2813         if (Reverse) {
2814           // If we store to reverse consecutive memory locations, then we need
2815           // to reverse the order of elements in the stored value.
2816           StoredVal = reverseVector(StoredVal);
2817           // We don't want to update the value in the map as it might be used in
2818           // another expression. So don't call resetVectorValue(StoredVal).
2819         }
2820         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2821         if (isMaskRequired)
2822           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2823                                             BlockInMaskParts[Part]);
2824         else
2825           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2826       }
2827       addMetadata(NewSI, SI);
2828     }
2829     return;
2830   }
2831 
2832   // Handle loads.
2833   assert(LI && "Must have a load instruction");
2834   setDebugLocFromInst(Builder, LI);
2835   for (unsigned Part = 0; Part < UF; ++Part) {
2836     Value *NewLI;
2837     if (CreateGatherScatter) {
2838       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2839       Value *VectorGep = State.get(Addr, Part);
2840       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2841                                          nullptr, "wide.masked.gather");
2842       addMetadata(NewLI, LI);
2843     } else {
2844       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2845       if (isMaskRequired)
2846         NewLI = Builder.CreateMaskedLoad(
2847             VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
2848             "wide.masked.load");
2849       else
2850         NewLI =
2851             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2852 
      // Add metadata to the load, but record the reverse shuffle (if any) as
      // the mapped vector value.
2854       addMetadata(NewLI, LI);
2855       if (Reverse)
2856         NewLI = reverseVector(NewLI);
2857     }
2858 
2859     State.set(Def, Instr, NewLI, Part);
2860   }
2861 }
2862 
2863 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2864                                                const VPIteration &Instance,
2865                                                bool IfPredicateInstr,
2866                                                VPTransformState &State) {
2867   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2868 
2869   setDebugLocFromInst(Builder, Instr);
2870 
  // Does this instruction return a value?
2872   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2873 
2874   Instruction *Cloned = Instr->clone();
2875   if (!IsVoidRetTy)
2876     Cloned->setName(Instr->getName() + ".cloned");
2877 
2878   // Replace the operands of the cloned instructions with their scalar
2879   // equivalents in the new loop.
2880   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2881     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2882     auto InputInstance = Instance;
2883     if (!Operand || !OrigLoop->contains(Operand) ||
2884         (Cost->isUniformAfterVectorization(Operand, State.VF)))
2885       InputInstance.Lane = 0;
2886     auto *NewOp = State.get(User.getOperand(op), InputInstance);
2887     Cloned->setOperand(op, NewOp);
2888   }
2889   addNewMetadata(Cloned, Instr);
2890 
2891   // Place the cloned scalar in the new loop.
2892   Builder.Insert(Cloned);
2893 
  // TODO: Set result for VPValue of VPReplicateRecipe. This requires
2895   // representing scalar values in VPTransformState. Add the cloned scalar to
2896   // the scalar map entry.
2897   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2898 
2899   // If we just cloned a new assumption, add it the assumption cache.
2900   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2901     if (II->getIntrinsicID() == Intrinsic::assume)
2902       AC->registerAssumption(II);
2903 
2904   // End if-block.
2905   if (IfPredicateInstr)
2906     PredicatedInstructions.push_back(Cloned);
2907 }
2908 
2909 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2910                                                       Value *End, Value *Step,
2911                                                       Instruction *DL) {
2912   BasicBlock *Header = L->getHeader();
2913   BasicBlock *Latch = L->getLoopLatch();
2914   // As we're just creating this loop, it's possible no latch exists
2915   // yet. If so, use the header as this will be a single block loop.
2916   if (!Latch)
2917     Latch = Header;
2918 
2919   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2920   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2921   setDebugLocFromInst(Builder, OldInst);
2922   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2923 
2924   Builder.SetInsertPoint(Latch->getTerminator());
2925   setDebugLocFromInst(Builder, OldInst);
2926 
2927   // Create i+1 and fill the PHINode.
2928   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2929   Induction->addIncoming(Start, L->getLoopPreheader());
2930   Induction->addIncoming(Next, Latch);
2931   // Create the compare.
2932   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2933   Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
2934 
2935   // Now we have two terminators. Remove the old one from the block.
2936   Latch->getTerminator()->eraseFromParent();
2937 
2938   return Induction;
2939 }
2940 
2941 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2942   if (TripCount)
2943     return TripCount;
2944 
2945   assert(L && "Create Trip Count for null loop.");
2946   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2947   // Find the loop boundaries.
2948   ScalarEvolution *SE = PSE.getSE();
2949   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2950   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2951          "Invalid loop count");
2952 
2953   Type *IdxTy = Legal->getWidestInductionType();
2954   assert(IdxTy && "No type for induction");
2955 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign extended before the
  // compare. The only way we can get a backedge-taken count in that case is
  // if the induction variable was signed and as such will not overflow, so
  // the truncation is legal.
2961   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2962       IdxTy->getPrimitiveSizeInBits())
2963     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2964   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2965 
2966   // Get the total trip count from the count by adding 1.
2967   const SCEV *ExitCount = SE->getAddExpr(
2968       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
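  // For example (illustrative), a canonical counted loop that runs n times
  // has a backedge-taken count of n - 1; adding one yields the trip count n.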
2969 
2970   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2971 
2972   // Expand the trip count and place the new instructions in the preheader.
2973   // Notice that the pre-header does not change, only the loop body.
2974   SCEVExpander Exp(*SE, DL, "induction");
2975 
2976   // Count holds the overall loop count (N).
2977   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2978                                 L->getLoopPreheader()->getTerminator());
2979 
2980   if (TripCount->getType()->isPointerTy())
2981     TripCount =
2982         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2983                                     L->getLoopPreheader()->getTerminator());
2984 
2985   return TripCount;
2986 }
2987 
2988 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2989   if (VectorTripCount)
2990     return VectorTripCount;
2991 
2992   Value *TC = getOrCreateTripCount(L);
2993   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2994 
2995   Type *Ty = TC->getType();
2996   // This is where we can make the step a runtime constant.
2997   Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
2998 
2999   // If the tail is to be folded by masking, round the number of iterations N
3000   // up to a multiple of Step instead of rounding down. This is done by first
3001   // adding Step-1 and then rounding down. Note that it's ok if this addition
3002   // overflows: the vector induction variable will eventually wrap to zero given
3003   // that it starts at zero and its Step is a power of two; the loop will then
3004   // exit, with the last early-exit vector comparison also producing all-true.
3005   if (Cost->foldTailByMasking()) {
3006     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3007            "VF*UF must be a power of 2 when folding tail by masking");
3008     assert(!VF.isScalable() &&
3009            "Tail folding not yet supported for scalable vectors");
3010     TC = Builder.CreateAdd(
3011         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3012   }
3013 
3014   // Now we need to generate the expression for the part of the loop that the
3015   // vectorized body will execute. This is equal to N - (N % Step) if scalar
3016   // iterations are not required for correctness, or N - Step, otherwise. Step
3017   // is equal to the vectorization factor (number of SIMD elements) times the
3018   // unroll factor (number of SIMD instructions).
3019   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3020 
3021   // There are two cases where we need to ensure (at least) the last iteration
3022   // runs in the scalar remainder loop. Thus, if the step evenly divides
3023   // the trip count, we set the remainder to be equal to the step. If the step
3024   // does not evenly divide the trip count, no adjustment is necessary since
3025   // there will already be scalar iterations. Note that the minimum iterations
3026   // check ensures that N >= Step. The cases are:
3027   // 1) If there is a non-reversed interleaved group that may speculatively
3028   //    access memory out-of-bounds.
3029   // 2) If any instruction may follow a conditionally taken exit. That is, if
3030   //    the loop contains multiple exiting blocks, or a single exiting block
3031   //    which is not the latch.
3032   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
3033     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3034     R = Builder.CreateSelect(IsZero, Step, R);
3035   }
3036 
3037   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
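  // For example (illustrative), with N = 10, VF = 4 and UF = 1 the step is 4,
  // R = 10 % 4 = 2 and the vector trip count is 8. With tail folding, N is
  // first rounded up to 13, giving R = 1 and a vector trip count of 12, so
  // the masked vector loop covers all 10 iterations.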
3038 
3039   return VectorTripCount;
3040 }
3041 
3042 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3043                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as DstVTy.
  auto *DstFVTy = cast<FixedVectorType>(DstVTy);
  unsigned VF = DstFVTy->getNumElements();
  auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
3049   Type *SrcElemTy = SrcVecTy->getElementType();
3050   Type *DstElemTy = DstFVTy->getElementType();
3051   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3052          "Vector elements must have same size");
3053 
3054   // Do a direct cast if element types are castable.
3055   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3056     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3057   }
  // V cannot be directly cast to the desired vector type. This may happen
  // when V is a floating point vector but DstVTy is a vector of pointers, or
  // vice versa. Handle this with a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
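  // For example (illustrative), on a target with 64-bit pointers a
  // <4 x double> value is converted to <4 x i8*> via an intermediate
  // <4 x i64> vector.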
3062   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3063          "Only one type should be a pointer type");
3064   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3065          "Only one type should be a floating point type");
3066   Type *IntTy =
3067       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3068   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3069   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3070   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3071 }
3072 
3073 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3074                                                          BasicBlock *Bypass) {
3075   Value *Count = getOrCreateTripCount(L);
  // Reuse the existing vector loop preheader for the TC checks.
  // Note that a new preheader block is generated for the vector loop.
3078   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3079   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3080 
3081   // Generate code to check if the loop's trip count is less than VF * UF, or
3082   // equal to it in case a scalar epilogue is required; this implies that the
3083   // vector trip count is zero. This check also covers the case where adding one
3084   // to the backedge-taken count overflowed leading to an incorrect trip count
3085   // of zero. In this case we will also jump to the scalar loop.
3086   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3087                                           : ICmpInst::ICMP_ULT;
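  // For example (illustrative), with VF = 4 and UF = 2 and no tail folding,
  // the branch to the scalar loop is taken whenever the trip count is less
  // than 8 (or less than or equal to 8 when a scalar epilogue is required).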
3088 
3089   // If tail is to be folded, vector loop takes care of all iterations.
3090   Value *CheckMinIters = Builder.getFalse();
3091   if (!Cost->foldTailByMasking()) {
3092     Value *Step =
3093         createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3094     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3095   }
3096   // Create new preheader for vector loop.
3097   LoopVectorPreHeader =
3098       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3099                  "vector.ph");
3100 
3101   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3102                                DT->getNode(Bypass)->getIDom()) &&
3103          "TC check is expected to dominate Bypass");
3104 
3105   // Update dominator for Bypass & LoopExit.
3106   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3107   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3108 
3109   ReplaceInstWithInst(
3110       TCCheckBlock->getTerminator(),
3111       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3112   LoopBypassBlocks.push_back(TCCheckBlock);
3113 }
3114 
3115 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  // Reuse the existing vector loop preheader for the SCEV checks.
  // Note that a new preheader block is generated for the vector loop.
3118   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
3119 
  // Generate the code to check the SCEV assumptions that we made.
3121   // We want the new basic block to start at the first instruction in a
3122   // sequence of instructions that form a check.
3123   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
3124                    "scev.check");
3125   Value *SCEVCheck = Exp.expandCodeForPredicate(
3126       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
3127 
3128   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
3129     if (C->isZero())
3130       return;
3131 
3132   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3133            (OptForSizeBasedOnProfile &&
3134             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3135          "Cannot SCEV check stride or overflow when optimizing for size");
3136 
3137   SCEVCheckBlock->setName("vector.scevcheck");
3138   // Create new preheader for vector loop.
3139   LoopVectorPreHeader =
3140       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
3141                  nullptr, "vector.ph");
3142 
  // Update the dominator only if this is the first RT check.
3144   if (LoopBypassBlocks.empty()) {
3145     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3146     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3147   }
3148 
3149   ReplaceInstWithInst(
3150       SCEVCheckBlock->getTerminator(),
3151       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
3152   LoopBypassBlocks.push_back(SCEVCheckBlock);
3153   AddedSafetyChecks = true;
3154 }
3155 
3156 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3157   // VPlan-native path does not do any analysis for runtime checks currently.
3158   if (EnableVPlanNativePath)
3159     return;
3160 
  // Reuse the existing vector loop preheader for the runtime memory checks.
  // Note that a new preheader block is generated for the vector loop.
3163   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
3164 
3165   // Generate the code that checks in runtime if arrays overlap. We put the
3166   // checks into a separate block to make the more common case of few elements
3167   // faster.
3168   auto *LAI = Legal->getLAI();
3169   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
3170   if (!RtPtrChecking.Need)
3171     return;
3172 
3173   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3174     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3175            "Cannot emit memory checks when optimizing for size, unless forced "
3176            "to vectorize.");
3177     ORE->emit([&]() {
3178       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3179                                         L->getStartLoc(), L->getHeader())
3180              << "Code-size may be reduced by not forcing "
3181                 "vectorization, or by source-code modifications "
3182                 "eliminating the need for runtime checks "
3183                 "(e.g., adding 'restrict').";
3184     });
3185   }
3186 
3187   MemCheckBlock->setName("vector.memcheck");
3188   // Create new preheader for vector loop.
3189   LoopVectorPreHeader =
3190       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
3191                  "vector.ph");
3192 
  // Update the dominator only if this is the first RT check. This must happen
  // before MemCheckBlock is added to LoopBypassBlocks below, otherwise the
  // emptiness check can never succeed.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, MemCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
  }

  auto *CondBranch = cast<BranchInst>(
      Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
  ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
  LoopBypassBlocks.push_back(MemCheckBlock);
  AddedSafetyChecks = true;
3204 
3205   Instruction *FirstCheckInst;
3206   Instruction *MemRuntimeCheck;
3207   std::tie(FirstCheckInst, MemRuntimeCheck) =
3208       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
3209                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
3210   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
3211                             "claimed checks are required");
3212   CondBranch->setCondition(MemRuntimeCheck);
3213 
3214   // We currently don't use LoopVersioning for the actual loop cloning but we
3215   // still use it to add the noalias metadata.
3216   LVer = std::make_unique<LoopVersioning>(
3217       *Legal->getLAI(),
3218       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3219       DT, PSE.getSE());
3220   LVer->prepareNoAliasMetadata();
3221 }
3222 
3223 Value *InnerLoopVectorizer::emitTransformedIndex(
3224     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3225     const InductionDescriptor &ID) const {
3226 
3227   SCEVExpander Exp(*SE, DL, "induction");
3228   auto Step = ID.getStep();
3229   auto StartValue = ID.getStartValue();
3230   assert(Index->getType() == Step->getType() &&
3231          "Index type does not match StepValue type");
3232 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle some
  // trivial cases only.
3239   auto CreateAdd = [&B](Value *X, Value *Y) {
3240     assert(X->getType() == Y->getType() && "Types don't match!");
3241     if (auto *CX = dyn_cast<ConstantInt>(X))
3242       if (CX->isZero())
3243         return Y;
3244     if (auto *CY = dyn_cast<ConstantInt>(Y))
3245       if (CY->isZero())
3246         return X;
3247     return B.CreateAdd(X, Y);
3248   };
3249 
3250   auto CreateMul = [&B](Value *X, Value *Y) {
3251     assert(X->getType() == Y->getType() && "Types don't match!");
3252     if (auto *CX = dyn_cast<ConstantInt>(X))
3253       if (CX->isOne())
3254         return Y;
3255     if (auto *CY = dyn_cast<ConstantInt>(Y))
3256       if (CY->isOne())
3257         return X;
3258     return B.CreateMul(X, Y);
3259   };
3260 
3261   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3262   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3263   // the DomTree is not kept up-to-date for additional blocks generated in the
3264   // vector loop. By using the header as insertion point, we guarantee that the
3265   // expanded instructions dominate all their uses.
3266   auto GetInsertPoint = [this, &B]() {
3267     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3268     if (InsertBB != LoopVectorBody &&
3269         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3270       return LoopVectorBody->getTerminator();
3271     return &*B.GetInsertPoint();
3272   };
3273   switch (ID.getKind()) {
3274   case InductionDescriptor::IK_IntInduction: {
3275     assert(Index->getType() == StartValue->getType() &&
3276            "Index type does not match StartValue type");
3277     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3278       return B.CreateSub(StartValue, Index);
3279     auto *Offset = CreateMul(
3280         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3281     return CreateAdd(StartValue, Offset);
3282   }
3283   case InductionDescriptor::IK_PtrInduction: {
3284     assert(isa<SCEVConstant>(Step) &&
3285            "Expected constant step for pointer induction");
3286     return B.CreateGEP(
3287         StartValue->getType()->getPointerElementType(), StartValue,
3288         CreateMul(Index,
3289                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3290   }
3291   case InductionDescriptor::IK_FpInduction: {
3292     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3293     auto InductionBinOp = ID.getInductionBinOp();
3294     assert(InductionBinOp &&
3295            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3296             InductionBinOp->getOpcode() == Instruction::FSub) &&
3297            "Original bin op should be defined for FP induction");
3298 
3299     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3300 
3301     // Floating point operations had to be 'fast' to enable the induction.
3302     FastMathFlags Flags;
3303     Flags.setFast();
3304 
3305     Value *MulExp = B.CreateFMul(StepValue, Index);
3306     if (isa<Instruction>(MulExp))
3307       // We have to check, the MulExp may be a constant.
3308       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3309 
3310     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3311                                "induction");
3312     if (isa<Instruction>(BOp))
3313       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3314 
3315     return BOp;
3316   }
3317   case InductionDescriptor::IK_NoInduction:
3318     return nullptr;
3319   }
3320   llvm_unreachable("invalid enum");
3321 }
3322 
3323 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3324   LoopScalarBody = OrigLoop->getHeader();
3325   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3326   LoopExitBlock = OrigLoop->getUniqueExitBlock();
3327   assert(LoopExitBlock && "Must have an exit block");
3328   assert(LoopVectorPreHeader && "Invalid loop structure");
3329 
3330   LoopMiddleBlock =
3331       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3332                  LI, nullptr, Twine(Prefix) + "middle.block");
3333   LoopScalarPreHeader =
3334       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3335                  nullptr, Twine(Prefix) + "scalar.ph");
3336 
3337   // Set up branch from middle block to the exit and scalar preheader blocks.
3338   // completeLoopSkeleton will update the condition to use an iteration check,
3339   // if required to decide whether to execute the remainder.
3340   BranchInst *BrInst =
3341       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
3342   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3343   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3344   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3345 
  // We intentionally don't let SplitBlock update LoopInfo since LoopVectorBody
  // should belong to a different loop than LoopVectorPreHeader. LoopVectorBody
  // is explicitly added to the correct place a few lines later.
3349   LoopVectorBody =
3350       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3351                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3352 
3353   // Update dominator for loop exit.
3354   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3355 
3356   // Create and register the new vector loop.
3357   Loop *Lp = LI->AllocateLoop();
3358   Loop *ParentLoop = OrigLoop->getParentLoop();
3359 
3360   // Insert the new loop into the loop nest and register the new basic blocks
3361   // before calling any utilities such as SCEV that require valid LoopInfo.
3362   if (ParentLoop) {
3363     ParentLoop->addChildLoop(Lp);
3364   } else {
3365     LI->addTopLevelLoop(Lp);
3366   }
3367   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3368   return Lp;
3369 }
3370 
3371 void InnerLoopVectorizer::createInductionResumeValues(
3372     Loop *L, Value *VectorTripCount,
3373     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3374   assert(VectorTripCount && L && "Expected valid arguments");
3375   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3376           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3377          "Inconsistent information about additional bypass.");
3378   // We are going to resume the execution of the scalar loop.
3379   // Go over all of the induction variables that we found and fix the
3380   // PHIs that are left in the scalar version of the loop.
3381   // The starting values of PHI nodes depend on the counter of the last
3382   // iteration in the vectorized loop.
3383   // If we come from a bypass edge then we need to start from the original
3384   // start value.
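  // For example (illustrative), the primary induction of a loop counting up
  // from 0 resumes at the vector trip count when control arrives from the
  // middle block, and at its original start value 0 when a bypass edge is
  // taken.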
3385   for (auto &InductionEntry : Legal->getInductionVars()) {
3386     PHINode *OrigPhi = InductionEntry.first;
3387     InductionDescriptor II = InductionEntry.second;
3388 
    // Create phi nodes to merge from the backedge-taken check block.
3390     PHINode *BCResumeVal =
3391         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3392                         LoopScalarPreHeader->getTerminator());
3393     // Copy original phi DL over to the new one.
3394     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3395     Value *&EndValue = IVEndValues[OrigPhi];
3396     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3397     if (OrigPhi == OldInduction) {
3398       // We know what the end value is.
3399       EndValue = VectorTripCount;
3400     } else {
3401       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3402       Type *StepType = II.getStep()->getType();
3403       Instruction::CastOps CastOp =
3404           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3405       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3406       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3407       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3408       EndValue->setName("ind.end");
3409 
3410       // Compute the end value for the additional bypass (if applicable).
3411       if (AdditionalBypass.first) {
3412         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3413         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3414                                          StepType, true);
3415         CRD =
3416             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3417         EndValueFromAdditionalBypass =
3418             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3419         EndValueFromAdditionalBypass->setName("ind.end");
3420       }
3421     }
3422     // The new PHI merges the original incoming value, in case of a bypass,
3423     // or the value at the end of the vectorized loop.
3424     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3425 
3426     // Fix the scalar body counter (PHI node).
3427     // The old induction's phi node in the scalar body needs the truncated
3428     // value.
3429     for (BasicBlock *BB : LoopBypassBlocks)
3430       BCResumeVal->addIncoming(II.getStartValue(), BB);
3431 
3432     if (AdditionalBypass.first)
3433       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3434                                             EndValueFromAdditionalBypass);
3435 
3436     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3437   }
3438 }
3439 
3440 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3441                                                       MDNode *OrigLoopID) {
3442   assert(L && "Expected valid loop.");
3443 
3444   // The trip counts should be cached by now.
3445   Value *Count = getOrCreateTripCount(L);
3446   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3447 
3448   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3449 
3450   // Add a check in the middle block to see if we have completed
3451   // all of the iterations in the first vector loop.
3452   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3453   // If tail is to be folded, we know we don't need to run the remainder.
3454   if (!Cost->foldTailByMasking()) {
3455     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3456                                         Count, VectorTripCount, "cmp.n",
3457                                         LoopMiddleBlock->getTerminator());
3458 
3459     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3460     // of the corresponding compare because they may have ended up with
3461     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g. if the compare has a line number inside the loop.
3463     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3464     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3465   }
3466 
3467   // Get ready to start creating new instructions into the vectorized body.
3468   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3469          "Inconsistent vector loop preheader");
3470   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3471 
3472   Optional<MDNode *> VectorizedLoopID =
3473       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3474                                       LLVMLoopVectorizeFollowupVectorized});
3475   if (VectorizedLoopID.hasValue()) {
3476     L->setLoopID(VectorizedLoopID.getValue());
3477 
3478     // Do not setAlreadyVectorized if loop attributes have been defined
3479     // explicitly.
3480     return LoopVectorPreHeader;
3481   }
3482 
3483   // Keep all loop hints from the original loop on the vector loop (we'll
3484   // replace the vectorizer-specific hints below).
3485   if (MDNode *LID = OrigLoop->getLoopID())
3486     L->setLoopID(LID);
3487 
3488   LoopVectorizeHints Hints(L, true, *ORE);
3489   Hints.setAlreadyVectorized();
3490 
3491 #ifdef EXPENSIVE_CHECKS
3492   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3493   LI->verify(*DT);
3494 #endif
3495 
3496   return LoopVectorPreHeader;
3497 }
3498 
3499 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3500   /*
3501    In this function we generate a new loop. The new loop will contain
3502    the vectorized instructions while the old loop will continue to run the
3503    scalar remainder.
3504 
3505        [ ] <-- loop iteration number check.
3506     /   |
3507    /    v
3508   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3509   |  /  |
3510   | /   v
3511   ||   [ ]     <-- vector pre header.
3512   |/    |
3513   |     v
3514   |    [  ] \
3515   |    [  ]_|   <-- vector loop.
3516   |     |
3517   |     v
3518   |   -[ ]   <--- middle-block.
3519   |  /  |
3520   | /   v
3521   -|- >[ ]     <--- new preheader.
3522    |    |
3523    |    v
3524    |   [ ] \
3525    |   [ ]_|   <-- old scalar loop to handle remainder.
3526     \   |
3527      \  v
3528       >[ ]     <-- exit block.
3529    ...
3530    */
3531 
3532   // Get the metadata of the original loop before it gets modified.
3533   MDNode *OrigLoopID = OrigLoop->getLoopID();
3534 
3535   // Create an empty vector loop, and prepare basic blocks for the runtime
3536   // checks.
3537   Loop *Lp = createVectorLoopSkeleton("");
3538 
3539   // Now, compare the new count to zero. If it is zero skip the vector loop and
3540   // jump to the scalar loop. This check also covers the case where the
3541   // backedge-taken count is uint##_max: adding one to it will overflow leading
3542   // to an incorrect trip count of zero. In this (rare) case we will also jump
3543   // to the scalar loop.
3544   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3545 
3546   // Generate the code to check any assumptions that we've made for SCEV
3547   // expressions.
3548   emitSCEVChecks(Lp, LoopScalarPreHeader);
3549 
3550   // Generate the code that checks in runtime if arrays overlap. We put the
3551   // checks into a separate block to make the more common case of few elements
3552   // faster.
3553   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3554 
3555   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators that often have multiple pointer
3557   // induction variables. In the code below we also support a case where we
3558   // don't have a single induction variable.
3559   //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
3562   //   - is an integer
3563   //   - counts from zero, stepping by one
3564   //   - is the size of the widest induction variable type
3565   // then we create a new one.
3566   OldInduction = Legal->getPrimaryInduction();
3567   Type *IdxTy = Legal->getWidestInductionType();
3568   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3569   // The loop step is equal to the vectorization factor (num of SIMD elements)
3570   // times the unroll factor (num of SIMD instructions).
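  // For example (for fixed-width vectors), with VF = 4 and UF = 2 the
  // canonical induction variable is advanced by 8 on every vector iteration.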
3571   Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3572   Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3573   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3574   Induction =
3575       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3576                               getDebugLocFromInstOrOperands(OldInduction));
3577 
3578   // Emit phis for the new starting index of the scalar loop.
3579   createInductionResumeValues(Lp, CountRoundDown);
3580 
3581   return completeLoopSkeleton(Lp, OrigLoopID);
3582 }
3583 
3584 // Fix up external users of the induction variable. At this point, we are
3585 // in LCSSA form, with all external PHIs that use the IV having one input value,
3586 // coming from the remainder loop. We need those PHIs to also have a correct
3587 // value for the IV when arriving directly from the middle block.
3588 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3589                                        const InductionDescriptor &II,
3590                                        Value *CountRoundDown, Value *EndValue,
3591                                        BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the update that feeds into the phi from
  // the loop latch) and those that use the penultimate value (the phi
  // itself). We allow both, but they, obviously, have different values.
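  //
  // For example, given the shorthand IR
  //
  //   %i = phi i64 [ 0, %ph ], [ %i.next, %latch ]
  //   %i.next = add i64 %i, 1
  //
  // an LCSSA phi that uses %i.next outside the loop must see EndValue, while
  // one that uses %i must see EndValue - Step; both cases are handled below.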
3596 
3597   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3598 
3599   DenseMap<Value *, Value *> MissingVals;
3600 
3601   // An external user of the last iteration's value should see the value that
3602   // the remainder loop uses to initialize its own IV.
3603   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3604   for (User *U : PostInc->users()) {
3605     Instruction *UI = cast<Instruction>(U);
3606     if (!OrigLoop->contains(UI)) {
3607       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3608       MissingVals[UI] = EndValue;
3609     }
3610   }
3611 
  // An external user of the penultimate value needs to see EndValue - Step.
3613   // The simplest way to get this is to recompute it from the constituent SCEVs,
3614   // that is Start + (Step * (CRD - 1)).
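  // For example, an IV that starts at 0 and steps by 1 simply escapes with
  // the value CRD - 1.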
3615   for (User *U : OrigPhi->users()) {
3616     auto *UI = cast<Instruction>(U);
3617     if (!OrigLoop->contains(UI)) {
3618       const DataLayout &DL =
3619           OrigLoop->getHeader()->getModule()->getDataLayout();
3620       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3621 
3622       IRBuilder<> B(MiddleBlock->getTerminator());
3623       Value *CountMinusOne = B.CreateSub(
3624           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3625       Value *CMO =
3626           !II.getStep()->getType()->isIntegerTy()
3627               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3628                              II.getStep()->getType())
3629               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3630       CMO->setName("cast.cmo");
3631       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3632       Escape->setName("ind.escape");
3633       MissingVals[UI] = Escape;
3634     }
3635   }
3636 
3637   for (auto &I : MissingVals) {
3638     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3640     // that is %IV2 = phi [...], [ %IV1, %latch ]
3641     // In this case, if IV1 has an external use, we need to avoid adding both
3642     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3643     // don't already have an incoming value for the middle block.
3644     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3645       PHI->addIncoming(I.second, MiddleBlock);
3646   }
3647 }
3648 
3649 namespace {
3650 
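/// DenseMap info that treats structurally identical instructions (same
/// opcode and operands, see Instruction::isIdenticalTo) as equal keys, so
/// that the simple CSE below can map an instruction to an earlier, identical
/// one.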
3651 struct CSEDenseMapInfo {
3652   static bool canHandle(const Instruction *I) {
3653     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3654            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3655   }
3656 
3657   static inline Instruction *getEmptyKey() {
3658     return DenseMapInfo<Instruction *>::getEmptyKey();
3659   }
3660 
3661   static inline Instruction *getTombstoneKey() {
3662     return DenseMapInfo<Instruction *>::getTombstoneKey();
3663   }
3664 
3665   static unsigned getHashValue(const Instruction *I) {
3666     assert(canHandle(I) && "Unknown instruction!");
3667     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3668                                                            I->value_op_end()));
3669   }
3670 
3671   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3672     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3673         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3674       return LHS == RHS;
3675     return LHS->isIdenticalTo(RHS);
3676   }
3677 };
3678 
3679 } // end anonymous namespace
3680 
/// Perform CSE of induction variable instructions.
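/// For example, unrolling can leave several identical getelementptr or
/// extractelement instructions in the vector body; all but the first are
/// removed and their uses are rewired to the surviving copy.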
3682 static void cse(BasicBlock *BB) {
3683   // Perform simple cse.
3684   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3685   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3686     Instruction *In = &*I++;
3687 
3688     if (!CSEDenseMapInfo::canHandle(In))
3689       continue;
3690 
3691     // Check if we can replace this instruction with any of the
3692     // visited instructions.
3693     if (Instruction *V = CSEMap.lookup(In)) {
3694       In->replaceAllUsesWith(V);
3695       In->eraseFromParent();
3696       continue;
3697     }
3698 
3699     CSEMap[In] = In;
3700   }
3701 }
3702 
3703 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3704                                                        ElementCount VF,
3705                                                        bool &NeedToScalarize) {
3706   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3707   Function *F = CI->getCalledFunction();
3708   Type *ScalarRetTy = CI->getType();
3709   SmallVector<Type *, 4> Tys, ScalarTys;
3710   for (auto &ArgOp : CI->arg_operands())
3711     ScalarTys.push_back(ArgOp->getType());
3712 
3713   // Estimate cost of scalarized vector call. The source operands are assumed
3714   // to be vectors, so we need to extract individual elements from there,
3715   // execute VF scalar calls, and then gather the result into the vector return
3716   // value.
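  // Roughly: Cost = ScalarCallCost * VF + scalarization (extract/insert)
  // overhead, as computed below.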
3717   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3718                                                  TTI::TCK_RecipThroughput);
3719   if (VF.isScalar())
3720     return ScalarCallCost;
3721 
3722   // Compute corresponding vector type for return value and arguments.
3723   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3724   for (Type *ScalarTy : ScalarTys)
3725     Tys.push_back(ToVectorTy(ScalarTy, VF));
3726 
3727   // Compute costs of unpacking argument values for the scalar calls and
3728   // packing the return values to a vector.
3729   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3730 
3731   unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3732 
3733   // If we can't emit a vector call for this function, then the currently found
3734   // cost is the cost we need to return.
3735   NeedToScalarize = true;
3736   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3737   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3738 
3739   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3740     return Cost;
3741 
3742   // If the corresponding vector cost is cheaper, return its cost.
3743   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3744                                                  TTI::TCK_RecipThroughput);
3745   if (VectorCallCost < Cost) {
3746     NeedToScalarize = false;
3747     return VectorCallCost;
3748   }
3749   return Cost;
3750 }
3751 
3752 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3753                                                             ElementCount VF) {
3754   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3755   assert(ID && "Expected intrinsic call!");
3756 
3757   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3758   return TTI.getIntrinsicInstrCost(CostAttrs,
3759                                    TargetTransformInfo::TCK_RecipThroughput);
3760 }
3761 
3762 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3763   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3764   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3765   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3766 }
3767 
3768 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3769   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3770   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3771   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3772 }
3773 
3774 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3775   // For every instruction `I` in MinBWs, truncate the operands, create a
3776   // truncated version of `I` and reextend its result. InstCombine runs
3777   // later and will remove any ext/trunc pairs.
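  // For example, if MinBWs records that an 'add <4 x i32>' needs only 8 bits,
  // its operands are truncated to <4 x i8>, the add is recreated on <4 x i8>,
  // and the result is zero-extended back to <4 x i32>.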
3778   SmallPtrSet<Value *, 4> Erased;
3779   for (const auto &KV : Cost->getMinimalBitwidths()) {
3780     // If the value wasn't vectorized, we must maintain the original scalar
3781     // type. The absence of the value from VectorLoopValueMap indicates that it
3782     // wasn't vectorized.
3783     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3784       continue;
3785     for (unsigned Part = 0; Part < UF; ++Part) {
3786       Value *I = getOrCreateVectorValue(KV.first, Part);
3787       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3788         continue;
3789       Type *OriginalTy = I->getType();
3790       Type *ScalarTruncatedTy =
3791           IntegerType::get(OriginalTy->getContext(), KV.second);
3792       auto *TruncatedTy = FixedVectorType::get(
3793           ScalarTruncatedTy,
3794           cast<FixedVectorType>(OriginalTy)->getNumElements());
3795       if (TruncatedTy == OriginalTy)
3796         continue;
3797 
3798       IRBuilder<> B(cast<Instruction>(I));
3799       auto ShrinkOperand = [&](Value *V) -> Value * {
3800         if (auto *ZI = dyn_cast<ZExtInst>(V))
3801           if (ZI->getSrcTy() == TruncatedTy)
3802             return ZI->getOperand(0);
3803         return B.CreateZExtOrTrunc(V, TruncatedTy);
3804       };
3805 
3806       // The actual instruction modification depends on the instruction type,
3807       // unfortunately.
3808       Value *NewI = nullptr;
3809       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3810         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3811                              ShrinkOperand(BO->getOperand(1)));
3812 
3813         // Any wrapping introduced by shrinking this operation shouldn't be
3814         // considered undefined behavior. So, we can't unconditionally copy
3815         // arithmetic wrapping flags to NewI.
3816         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3817       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3818         NewI =
3819             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3820                          ShrinkOperand(CI->getOperand(1)));
3821       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3822         NewI = B.CreateSelect(SI->getCondition(),
3823                               ShrinkOperand(SI->getTrueValue()),
3824                               ShrinkOperand(SI->getFalseValue()));
3825       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3826         switch (CI->getOpcode()) {
3827         default:
3828           llvm_unreachable("Unhandled cast!");
3829         case Instruction::Trunc:
3830           NewI = ShrinkOperand(CI->getOperand(0));
3831           break;
3832         case Instruction::SExt:
3833           NewI = B.CreateSExtOrTrunc(
3834               CI->getOperand(0),
3835               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3836           break;
3837         case Instruction::ZExt:
3838           NewI = B.CreateZExtOrTrunc(
3839               CI->getOperand(0),
3840               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3841           break;
3842         }
3843       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3844         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3845                              ->getNumElements();
3846         auto *O0 = B.CreateZExtOrTrunc(
3847             SI->getOperand(0),
3848             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3849         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3850                              ->getNumElements();
3851         auto *O1 = B.CreateZExtOrTrunc(
3852             SI->getOperand(1),
3853             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3854 
3855         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3856       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3857         // Don't do anything with the operands, just extend the result.
3858         continue;
3859       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3860         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3861                             ->getNumElements();
3862         auto *O0 = B.CreateZExtOrTrunc(
3863             IE->getOperand(0),
3864             FixedVectorType::get(ScalarTruncatedTy, Elements));
3865         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3866         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3867       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3868         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3869                             ->getNumElements();
3870         auto *O0 = B.CreateZExtOrTrunc(
3871             EE->getOperand(0),
3872             FixedVectorType::get(ScalarTruncatedTy, Elements));
3873         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3874       } else {
3875         // If we don't know what to do, be conservative and don't do anything.
3876         continue;
3877       }
3878 
3879       // Lastly, extend the result.
3880       NewI->takeName(cast<Instruction>(I));
3881       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3882       I->replaceAllUsesWith(Res);
3883       cast<Instruction>(I)->eraseFromParent();
3884       Erased.insert(I);
3885       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3886     }
3887   }
3888 
3889   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3890   for (const auto &KV : Cost->getMinimalBitwidths()) {
3891     // If the value wasn't vectorized, we must maintain the original scalar
3892     // type. The absence of the value from VectorLoopValueMap indicates that it
3893     // wasn't vectorized.
3894     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3895       continue;
3896     for (unsigned Part = 0; Part < UF; ++Part) {
3897       Value *I = getOrCreateVectorValue(KV.first, Part);
3898       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3899       if (Inst && Inst->use_empty()) {
3900         Value *NewI = Inst->getOperand(0);
3901         Inst->eraseFromParent();
3902         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3903       }
3904     }
3905   }
3906 }
3907 
3908 void InnerLoopVectorizer::fixVectorizedLoop() {
3909   // Insert truncates and extends for any truncated instructions as hints to
3910   // InstCombine.
3911   if (VF.isVector())
3912     truncateToMinimalBitwidths();
3913 
3914   // Fix widened non-induction PHIs by setting up the PHI operands.
3915   if (OrigPHIsToFix.size()) {
3916     assert(EnableVPlanNativePath &&
3917            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3918     fixNonInductionPHIs();
3919   }
3920 
3921   // At this point every instruction in the original loop is widened to a
3922   // vector form. Now we need to fix the recurrences in the loop. These PHI
3923   // nodes are currently empty because we did not want to introduce cycles.
3924   // This is the second stage of vectorizing recurrences.
3925   fixCrossIterationPHIs();
3926 
3927   // Forget the original basic block.
3928   PSE.getSE()->forgetLoop(OrigLoop);
3929 
3930   // Fix-up external users of the induction variables.
3931   for (auto &Entry : Legal->getInductionVars())
3932     fixupIVUsers(Entry.first, Entry.second,
3933                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3934                  IVEndValues[Entry.first], LoopMiddleBlock);
3935 
3936   fixLCSSAPHIs();
3937   for (Instruction *PI : PredicatedInstructions)
3938     sinkScalarOperands(&*PI);
3939 
3940   // Remove redundant induction instructions.
3941   cse(LoopVectorBody);
3942 
3943   // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less precise result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
3952   //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
3956   setProfileInfoAfterUnrolling(
3957       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3958       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3959 }
3960 
3961 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3962   // In order to support recurrences we need to be able to vectorize Phi nodes.
3963   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3964   // stage #2: We now need to fix the recurrences by adding incoming edges to
3965   // the currently empty PHI nodes. At this point every instruction in the
3966   // original loop is widened to a vector form so we can use them to construct
3967   // the incoming edges.
3968   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3969     // Handle first-order recurrences and reductions that need to be fixed.
3970     if (Legal->isFirstOrderRecurrence(&Phi))
3971       fixFirstOrderRecurrence(&Phi);
3972     else if (Legal->isReductionVariable(&Phi))
3973       fixReduction(&Phi);
3974   }
3975 }
3976 
3977 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3978   // This is the second phase of vectorizing first-order recurrences. An
3979   // overview of the transformation is described below. Suppose we have the
3980   // following loop.
3981   //
3982   //   for (int i = 0; i < n; ++i)
3983   //     b[i] = a[i] - a[i - 1];
3984   //
3985   // There is a first-order recurrence on "a". For this loop, the shorthand
3986   // scalar IR looks like:
3987   //
3988   //   scalar.ph:
3989   //     s_init = a[-1]
3990   //     br scalar.body
3991   //
3992   //   scalar.body:
3993   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3994   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3995   //     s2 = a[i]
3996   //     b[i] = s2 - s1
3997   //     br cond, scalar.body, ...
3998   //
  // In this example, s1 is a recurrence because its value depends on the
4000   // previous iteration. In the first phase of vectorization, we created a
4001   // temporary value for s1. We now complete the vectorization and produce the
4002   // shorthand vector IR shown below (for VF = 4, UF = 1).
4003   //
4004   //   vector.ph:
4005   //     v_init = vector(..., ..., ..., a[-1])
4006   //     br vector.body
4007   //
4008   //   vector.body
4009   //     i = phi [0, vector.ph], [i+4, vector.body]
4010   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
4011   //     v2 = a[i, i+1, i+2, i+3];
4012   //     v3 = vector(v1(3), v2(0, 1, 2))
4013   //     b[i, i+1, i+2, i+3] = v2 - v3
4014   //     br cond, vector.body, middle.block
4015   //
4016   //   middle.block:
4017   //     x = v2(3)
4018   //     br scalar.ph
4019   //
4020   //   scalar.ph:
4021   //     s_init = phi [x, middle.block], [a[-1], otherwise]
4022   //     br scalar.body
4023   //
  // After the vector loop finishes executing, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
4026 
4027   // Get the original loop preheader and single loop latch.
4028   auto *Preheader = OrigLoop->getLoopPreheader();
4029   auto *Latch = OrigLoop->getLoopLatch();
4030 
4031   // Get the initial and previous values of the scalar recurrence.
4032   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4033   auto *Previous = Phi->getIncomingValueForBlock(Latch);
4034 
4035   // Create a vector from the initial value.
4036   auto *VectorInit = ScalarInit;
4037   if (VF.isVector()) {
4038     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4039     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4040     VectorInit = Builder.CreateInsertElement(
4041         PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4042         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
4043   }
4044 
4045   // We constructed a temporary phi node in the first phase of vectorization.
4046   // This phi node will eventually be deleted.
4047   Builder.SetInsertPoint(
4048       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
4049 
  // Create a phi node for the new recurrence. The current value will either
  // be the initial value inserted into a vector or a loop-varying vector
  // value.
4052   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4053   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4054 
  // Get the vectorized previous value of the last unroll part, UF - 1. It
  // appears last among all unrolled iterations, due to the order of their
  // construction.
4057   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
4058 
4059   // Find and set the insertion point after the previous value if it is an
4060   // instruction.
4061   BasicBlock::iterator InsertPt;
4062   // Note that the previous value may have been constant-folded so it is not
4063   // guaranteed to be an instruction in the vector loop.
4064   // FIXME: Loop invariant values do not form recurrences. We should deal with
4065   //        them earlier.
4066   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4067     InsertPt = LoopVectorBody->getFirstInsertionPt();
4068   else {
4069     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4070     if (isa<PHINode>(PreviousLastPart))
4071       // If the previous value is a phi node, we should insert after all the phi
4072       // nodes in the block containing the PHI to avoid breaking basic block
      // verification. Note that the basic block may be different from
4074       // LoopVectorBody, in case we predicate the loop.
4075       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4076     else
4077       InsertPt = ++PreviousInst->getIterator();
4078   }
4079   Builder.SetInsertPoint(&*InsertPt);
4080 
4081   // We will construct a vector for the recurrence by combining the values for
4082   // the current and previous iterations. This is the required shuffle mask.
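  // For example, with VF = 4 the mask is <3, 4, 5, 6>: the last lane of the
  // first shuffle operand followed by the first three lanes of the second.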
4083   assert(!VF.isScalable());
4084   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
4085   ShuffleMask[0] = VF.getKnownMinValue() - 1;
4086   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
4087     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
4088 
4089   // The vector from which to take the initial value for the current iteration
4090   // (actual or unrolled). Initially, this is the vector phi node.
4091   Value *Incoming = VecPhi;
4092 
4093   // Shuffle the current and previous vector and update the vector parts.
4094   for (unsigned Part = 0; Part < UF; ++Part) {
4095     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
4096     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
4097     auto *Shuffle =
4098         VF.isVector()
4099             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
4100             : Incoming;
4101     PhiPart->replaceAllUsesWith(Shuffle);
4102     cast<Instruction>(PhiPart)->eraseFromParent();
4103     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
4104     Incoming = PreviousPart;
4105   }
4106 
4107   // Fix the latch value of the new recurrence in the vector loop.
4108   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4109 
4110   // Extract the last vector element in the middle block. This will be the
4111   // initial value for the recurrence when jumping to the scalar loop.
4112   auto *ExtractForScalar = Incoming;
4113   if (VF.isVector()) {
4114     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4115     ExtractForScalar = Builder.CreateExtractElement(
4116         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
4117         "vector.recur.extract");
4118   }
  // Extract the second-to-last element in the middle block if the
4120   // Phi is used outside the loop. We need to extract the phi itself
4121   // and not the last element (the phi update in the current iteration). This
4122   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4123   // when the scalar loop is not run at all.
4124   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4125   if (VF.isVector())
4126     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4127         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4128         "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
  // value of `Incoming`. This is analogous to the vectorized case above:
  // extracting the second-to-last element when VF > 1.
4133   else if (UF > 1)
4134     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
4135 
4136   // Fix the initial value of the original recurrence in the scalar loop.
4137   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4138   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4139   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4140     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4141     Start->addIncoming(Incoming, BB);
4142   }
4143 
4144   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4145   Phi->setName("scalar.recur");
4146 
4147   // Finally, fix users of the recurrence outside the loop. The users will need
4148   // either the last value of the scalar recurrence or the last value of the
4149   // vector recurrence we extracted in the middle block. Since the loop is in
4150   // LCSSA form, we just need to find all the phi nodes for the original scalar
4151   // recurrence in the exit block, and then add an edge for the middle block.
4152   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4153     if (LCSSAPhi.getIncomingValue(0) == Phi) {
4154       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4155     }
4156   }
4157 }
4158 
4159 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
  // Get its reduction variable descriptor.
4161   assert(Legal->isReductionVariable(Phi) &&
4162          "Unable to find the reduction variable");
4163   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4164 
4165   RecurKind RK = RdxDesc.getRecurrenceKind();
4166   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4167   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4168   setDebugLocFromInst(Builder, ReductionStartValue);
4169   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4170 
4171   // This is the vector-clone of the value that leaves the loop.
4172   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
4173 
4174   // Wrap flags are in general invalid after vectorization, clear them.
4175   clearReductionWrapFlags(RdxDesc);
4176 
4177   // Fix the vector-loop phi.
4178 
4179   // Reductions do not have to start at zero. They can start with
4180   // any loop invariant values.
4181   BasicBlock *Latch = OrigLoop->getLoopLatch();
4182   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4183 
4184   for (unsigned Part = 0; Part < UF; ++Part) {
4185     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
4186     Value *Val = getOrCreateVectorValue(LoopVal, Part);
4187     cast<PHINode>(VecRdxPhi)
4188       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4189   }
4190 
4191   // Before each round, move the insertion point right between
4192   // the PHIs and the values we are going to write.
4193   // This allows us to write both PHINodes and the extractelement
4194   // instructions.
4195   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4196 
4197   setDebugLocFromInst(Builder, LoopExitInst);
4198 
  // If the tail is folded by masking, the vector value that leaves the loop
  // should be a Select choosing between the vectorized LoopExitInst and the
  // vectorized Phi, rather than the LoopExitInst alone. For an in-loop
  // reduction the reduction will already be predicated, and does not need to
  // be handled here.
4203   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4204     for (unsigned Part = 0; Part < UF; ++Part) {
4205       Value *VecLoopExitInst =
4206           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4207       Value *Sel = nullptr;
4208       for (User *U : VecLoopExitInst->users()) {
4209         if (isa<SelectInst>(U)) {
4210           assert(!Sel && "Reduction exit feeding two selects");
4211           Sel = U;
4212         } else
4213           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4214       }
4215       assert(Sel && "Reduction exit feeds no select");
4216       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4217 
4218       // If the target can create a predicated operator for the reduction at no
4219       // extra cost in the loop (for example a predicated vadd), it can be
4220       // cheaper for the select to remain in the loop than be sunk out of it,
4221       // and so use the select value for the phi instead of the old
4222       // LoopExitValue.
4223       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4224       if (PreferPredicatedReductionSelect ||
4225           TTI->preferPredicatedReductionSelect(
4226               RdxDesc.getOpcode(), Phi->getType(),
4227               TargetTransformInfo::ReductionFlags())) {
4228         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4229         VecRdxPhi->setIncomingValueForBlock(
4230             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4231       }
4232     }
4233   }
4234 
4235   // If the vector reduction can be performed in a smaller type, we truncate
4236   // then extend the loop exit value to enable InstCombine to evaluate the
4237   // entire expression in the smaller type.
4238   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4239     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4240     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4241     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4242     Builder.SetInsertPoint(
4243         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4244     VectorParts RdxParts(UF);
4245     for (unsigned Part = 0; Part < UF; ++Part) {
4246       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4247       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4248       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4249                                         : Builder.CreateZExt(Trunc, VecTy);
4250       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4251            UI != RdxParts[Part]->user_end();)
4252         if (*UI != Trunc) {
4253           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4254           RdxParts[Part] = Extnd;
4255         } else {
4256           ++UI;
4257         }
4258     }
4259     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4260     for (unsigned Part = 0; Part < UF; ++Part) {
4261       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4262       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4263     }
4264   }
4265 
4266   // Reduce all of the unrolled parts into a single vector.
4267   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4268   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4269 
4270   // The middle block terminator has already been assigned a DebugLoc here (the
4271   // OrigLoop's single latch terminator). We want the whole middle block to
4272   // appear to execute on this line because: (a) it is all compiler generated,
4273   // (b) these instructions are always executed after evaluating the latch
4274   // conditional branch, and (c) other passes may add new predecessors which
4275   // terminate on this line. This is the easiest way to ensure we don't
4276   // accidentally cause an extra step back into the loop while debugging.
4277   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4278   for (unsigned Part = 1; Part < UF; ++Part) {
4279     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4280     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4281       // Floating point operations had to be 'fast' to enable the reduction.
4282       ReducedPartRdx = addFastMathFlag(
4283           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4284                               ReducedPartRdx, "bin.rdx"),
4285           RdxDesc.getFastMathFlags());
4286     else
4287       ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4288   }
4289 
  // Create the reduction after the loop. Note that in-loop reductions create
  // the target reduction in the loop using a Reduction recipe.
4292   if (VF.isVector() && !IsInLoopReductionPhi) {
4293     ReducedPartRdx =
4294         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4295     // If the reduction can be performed in a smaller type, we need to extend
4296     // the reduction to the wider type before we branch to the original loop.
4297     if (Phi->getType() != RdxDesc.getRecurrenceType())
4298       ReducedPartRdx =
4299         RdxDesc.isSigned()
4300         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4301         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4302   }
4303 
4304   // Create a phi node that merges control-flow from the backedge-taken check
4305   // block and the middle block.
4306   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4307                                         LoopScalarPreHeader->getTerminator());
4308   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4309     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4310   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4311 
4312   // Now, we need to fix the users of the reduction variable
4313   // inside and outside of the scalar remainder loop.
4314   // We know that the loop is in LCSSA form. We need to update the
4315   // PHI nodes in the exit blocks.
4316   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4317     // All PHINodes need to have a single entry edge, or two if
4318     // we already fixed them.
4319     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4320 
4321     // We found a reduction value exit-PHI. Update it with the
4322     // incoming bypass edge.
4323     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4324       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4325   } // end of the LCSSA phi scan.
4326 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
      Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4331   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4332   // Pick the other block.
4333   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4334   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4335   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4336 }
4337 
4338 void InnerLoopVectorizer::clearReductionWrapFlags(
4339     RecurrenceDescriptor &RdxDesc) {
4340   RecurKind RK = RdxDesc.getRecurrenceKind();
4341   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4342     return;
4343 
4344   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4345   assert(LoopExitInstr && "null loop exit instruction");
4346   SmallVector<Instruction *, 8> Worklist;
4347   SmallPtrSet<Instruction *, 8> Visited;
4348   Worklist.push_back(LoopExitInstr);
4349   Visited.insert(LoopExitInstr);
4350 
4351   while (!Worklist.empty()) {
4352     Instruction *Cur = Worklist.pop_back_val();
4353     if (isa<OverflowingBinaryOperator>(Cur))
4354       for (unsigned Part = 0; Part < UF; ++Part) {
4355         Value *V = getOrCreateVectorValue(Cur, Part);
4356         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4357       }
4358 
4359     for (User *U : Cur->users()) {
4360       Instruction *UI = cast<Instruction>(U);
4361       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4362           Visited.insert(UI).second)
4363         Worklist.push_back(UI);
4364     }
4365   }
4366 }
4367 
4368 void InnerLoopVectorizer::fixLCSSAPHIs() {
4369   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4370     if (LCSSAPhi.getNumIncomingValues() == 1) {
4371       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values are uniform, so lane zero suffices.
4373       unsigned LastLane = 0;
4374       if (isa<Instruction>(IncomingValue))
4375         LastLane = Cost->isUniformAfterVectorization(
4376                        cast<Instruction>(IncomingValue), VF)
4377                        ? 0
4378                        : VF.getKnownMinValue() - 1;
4379       assert((!VF.isScalable() || LastLane == 0) &&
4380              "scalable vectors dont support non-uniform scalars yet");
4381       // Can be a loop invariant incoming value or the last scalar value to be
4382       // extracted from the vectorized loop.
4383       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4384       Value *lastIncomingValue =
4385           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4386       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4387     }
4388   }
4389 }
4390 
4391 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4392   // The basic block and loop containing the predicated instruction.
4393   auto *PredBB = PredInst->getParent();
4394   auto *VectorLoop = LI->getLoopFor(PredBB);
4395 
4396   // Initialize a worklist with the operands of the predicated instruction.
4397   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4398 
4399   // Holds instructions that we need to analyze again. An instruction may be
4400   // reanalyzed if we don't yet know if we can sink it or not.
4401   SmallVector<Instruction *, 8> InstsToReanalyze;
4402 
4403   // Returns true if a given use occurs in the predicated block. Phi nodes use
4404   // their operands in their corresponding predecessor blocks.
4405   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4406     auto *I = cast<Instruction>(U.getUser());
4407     BasicBlock *BB = I->getParent();
4408     if (auto *Phi = dyn_cast<PHINode>(I))
4409       BB = Phi->getIncomingBlock(
4410           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4411     return BB == PredBB;
4412   };
4413 
4414   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
4416   // operands are then added to the worklist. The algorithm ends after one pass
4417   // through the worklist doesn't sink a single instruction.
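  // For example, an extractelement whose only use is the predicated
  // instruction can be moved into the predicated block; that in turn may
  // allow one of its operands to be sunk on a later pass.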
4418   bool Changed;
4419   do {
4420     // Add the instructions that need to be reanalyzed to the worklist, and
4421     // reset the changed indicator.
4422     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4423     InstsToReanalyze.clear();
4424     Changed = false;
4425 
4426     while (!Worklist.empty()) {
4427       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4428 
4429       // We can't sink an instruction if it is a phi node, is already in the
4430       // predicated block, is not in the loop, or may have side effects.
4431       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4432           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4433         continue;
4434 
4435       // It's legal to sink the instruction if all its uses occur in the
4436       // predicated block. Otherwise, there's nothing to do yet, and we may
4437       // need to reanalyze the instruction.
4438       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4439         InstsToReanalyze.push_back(I);
4440         continue;
4441       }
4442 
4443       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4445       I->moveBefore(&*PredBB->getFirstInsertionPt());
4446       Worklist.insert(I->op_begin(), I->op_end());
4447 
4448       // The sinking may have enabled other instructions to be sunk, so we will
4449       // need to iterate.
4450       Changed = true;
4451     }
4452   } while (Changed);
4453 }
4454 
4455 void InnerLoopVectorizer::fixNonInductionPHIs() {
4456   for (PHINode *OrigPhi : OrigPHIsToFix) {
4457     PHINode *NewPhi =
4458         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4459     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4460 
4461     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4462         predecessors(OrigPhi->getParent()));
4463     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4464         predecessors(NewPhi->getParent()));
4465     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4466            "Scalar and Vector BB should have the same number of predecessors");
4467 
4468     // The insertion point in Builder may be invalidated by the time we get
4469     // here. Force the Builder insertion point to something valid so that we do
4470     // not run into issues during insertion point restore in
4471     // getOrCreateVectorValue calls below.
4472     Builder.SetInsertPoint(NewPhi);
4473 
4474     // The predecessor order is preserved and we can rely on mapping between
4475     // scalar and vector block predecessors.
4476     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4477       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4478 
4479       // When looking up the new scalar/vector values to fix up, use incoming
4480       // values from original phi.
4481       Value *ScIncV =
4482           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4483 
4484       // Scalar incoming value may need a broadcast
4485       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4486       NewPhi->addIncoming(NewIncV, NewPredBB);
4487     }
4488   }
4489 }
4490 
4491 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4492                                    VPUser &Operands, unsigned UF,
4493                                    ElementCount VF, bool IsPtrLoopInvariant,
4494                                    SmallBitVector &IsIndexLoopInvariant,
4495                                    VPTransformState &State) {
4496   // Construct a vector GEP by widening the operands of the scalar GEP as
4497   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4498   // results in a vector of pointers when at least one operand of the GEP
4499   // is vector-typed. Thus, to keep the representation compact, we only use
4500   // vector-typed operands for loop-varying values.
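  // For example, a scalar 'getelementptr i32, i32* %p, i64 %idx' with a
  // loop-varying index is widened (for VF = 4) to
  // 'getelementptr i32, i32* %p, <4 x i64> %vec.idx', yielding <4 x i32*>.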
4501 
4502   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4503     // If we are vectorizing, but the GEP has only loop-invariant operands,
4504     // the GEP we build (by only using vector-typed operands for
4505     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4506     // produce a vector of pointers, we need to either arbitrarily pick an
4507     // operand to broadcast, or broadcast a clone of the original GEP.
4508     // Here, we broadcast a clone of the original.
4509     //
4510     // TODO: If at some point we decide to scalarize instructions having
4511     //       loop-invariant operands, this special case will no longer be
4512     //       required. We would add the scalarization decision to
4513     //       collectLoopScalars() and teach getVectorValue() to broadcast
4514     //       the lane-zero scalar value.
4515     auto *Clone = Builder.Insert(GEP->clone());
4516     for (unsigned Part = 0; Part < UF; ++Part) {
4517       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4518       State.set(VPDef, GEP, EntryPart, Part);
4519       addMetadata(EntryPart, GEP);
4520     }
4521   } else {
4522     // If the GEP has at least one loop-varying operand, we are sure to
4523     // produce a vector of pointers. But if we are only unrolling, we want
4524     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4525     // produce with the code below will be scalar (if VF == 1) or vector
4526     // (otherwise). Note that for the unroll-only case, we still maintain
4527     // values in the vector mapping with initVector, as we do for other
4528     // instructions.
4529     for (unsigned Part = 0; Part < UF; ++Part) {
4530       // The pointer operand of the new GEP. If it's loop-invariant, we
4531       // won't broadcast it.
4532       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4533                                      : State.get(Operands.getOperand(0), Part);
4534 
4535       // Collect all the indices for the new GEP. If any index is
4536       // loop-invariant, we won't broadcast it.
4537       SmallVector<Value *, 4> Indices;
4538       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4539         VPValue *Operand = Operands.getOperand(I);
4540         if (IsIndexLoopInvariant[I - 1])
4541           Indices.push_back(State.get(Operand, {0, 0}));
4542         else
4543           Indices.push_back(State.get(Operand, Part));
4544       }
4545 
4546       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4547       // but it should be a vector, otherwise.
4548       auto *NewGEP =
4549           GEP->isInBounds()
4550               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4551                                           Indices)
4552               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4553       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4554              "NewGEP is not a pointer vector");
4555       State.set(VPDef, GEP, NewGEP, Part);
4556       addMetadata(NewGEP, GEP);
4557     }
4558   }
4559 }
4560 
4561 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4562                                               RecurrenceDescriptor *RdxDesc,
4563                                               Value *StartV, unsigned UF,
4564                                               ElementCount VF) {
4565   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4566   PHINode *P = cast<PHINode>(PN);
4567   if (EnableVPlanNativePath) {
4568     // Currently we enter here in the VPlan-native path for non-induction
4569     // PHIs where all control flow is uniform. We simply widen these PHIs.
4570     // Create a vector phi with no operands - the vector phi operands will be
4571     // set at the end of vector code generation.
4572     Type *VecTy =
4573         (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4574     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4575     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4576     OrigPHIsToFix.push_back(P);
4577 
4578     return;
4579   }
4580 
4581   assert(PN->getParent() == OrigLoop->getHeader() &&
4582          "Non-header phis should have been handled elsewhere");
4583 
4584   // In order to support recurrences we need to be able to vectorize Phi nodes.
4585   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4586   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4587   // this value when we vectorize all of the instructions that use the PHI.
4588   if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4589     Value *Iden = nullptr;
4590     bool ScalarPHI =
4591         (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4592     Type *VecTy =
4593         ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4594 
4595     if (RdxDesc) {
4596       assert(Legal->isReductionVariable(P) && StartV &&
4597              "RdxDesc should only be set for reduction variables; in that case "
4598              "a StartV is also required");
4599       RecurKind RK = RdxDesc->getRecurrenceKind();
4600       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
        // MinMax reductions have the start value as their identity.
4602         if (ScalarPHI) {
4603           Iden = StartV;
4604         } else {
4605           IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4606           Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4607           StartV = Iden = Builder.CreateVectorSplat(VF, StartV, "minmax.ident");
4608         }
4609       } else {
4610         Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4611             RK, VecTy->getScalarType());
4612         Iden = IdenC;
4613 
4614         if (!ScalarPHI) {
4615           Iden = ConstantVector::getSplat(VF, IdenC);
4616           IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4617           Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4618           Constant *Zero = Builder.getInt32(0);
4619           StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4620         }
4621       }
4622     }
4623 
4624     for (unsigned Part = 0; Part < UF; ++Part) {
4625       // This is phase one of vectorizing PHIs.
4626       Value *EntryPart = PHINode::Create(
4627           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4628       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4629       if (StartV) {
4630         // Make sure to add the reduction start value only to the
4631         // first unroll part.
4632         Value *StartVal = (Part == 0) ? StartV : Iden;
4633         cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4634       }
4635     }
4636     return;
4637   }
4638 
4639   assert(!Legal->isReductionVariable(P) &&
4640          "reductions should be handled above");
4641 
4642   setDebugLocFromInst(Builder, P);
4643 
4644   // This PHINode must be an induction variable.
4645   // Make sure that we know about it.
4646   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4647 
4648   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4649   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4650 
4651   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4652   // which can be found from the original scalar operations.
4653   switch (II.getKind()) {
4654   case InductionDescriptor::IK_NoInduction:
4655     llvm_unreachable("Unknown induction");
4656   case InductionDescriptor::IK_IntInduction:
4657   case InductionDescriptor::IK_FpInduction:
4658     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4659   case InductionDescriptor::IK_PtrInduction: {
4660     // Handle the pointer induction variable case.
4661     assert(P->getType()->isPointerTy() && "Unexpected type.");
4662 
4663     if (Cost->isScalarAfterVectorization(P, VF)) {
4664       // This is the normalized GEP that starts counting at zero.
4665       Value *PtrInd =
4666           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4667       // Determine the number of scalars we need to generate for each unroll
4668       // iteration. If the instruction is uniform, we only need to generate the
4669       // first lane. Otherwise, we generate all VF values.
4670       unsigned Lanes =
4671           Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4672       for (unsigned Part = 0; Part < UF; ++Part) {
4673         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4674           Constant *Idx = ConstantInt::get(PtrInd->getType(),
4675                                            Lane + Part * VF.getKnownMinValue());
4676           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4677           Value *SclrGep =
4678               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4679           SclrGep->setName("next.gep");
4680           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4681         }
4682       }
4683       return;
4684     }
4685     assert(isa<SCEVConstant>(II.getStep()) &&
4686            "Induction step not a SCEV constant!");
4687     Type *PhiType = II.getStep()->getType();
4688 
4689     // Build a pointer phi
4690     Value *ScalarStartValue = II.getStartValue();
4691     Type *ScStValueType = ScalarStartValue->getType();
4692     PHINode *NewPointerPhi =
4693         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4694     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4695 
    // The pointer induction is advanced with a GEP placed in the loop latch.
4697     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4698     Instruction *InductionLoc = LoopLatch->getTerminator();
4699     const SCEV *ScalarStep = II.getStep();
4700     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4701     Value *ScalarStepValue =
4702         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4703     Value *InductionGEP = GetElementPtrInst::Create(
4704         ScStValueType->getPointerElementType(), NewPointerPhi,
4705         Builder.CreateMul(
4706             ScalarStepValue,
4707             ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4708         "ptr.ind", InductionLoc);
4709     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4710 
4711     // Create UF many actual address geps that use the pointer
4712     // phi as base and a vectorized version of the step value
4713     // (<step*0, ..., step*N>) as offset.
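    // For example, with VF = 4 and UF = 2, part 0 uses offsets
    // <0, 1, 2, 3> * step and part 1 uses offsets <4, 5, 6, 7> * step from
    // the pointer phi.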
4714     for (unsigned Part = 0; Part < UF; ++Part) {
4715       SmallVector<Constant *, 8> Indices;
4716       // Create a vector of consecutive numbers from zero to VF.
4717       for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4718         Indices.push_back(
4719             ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4720       Constant *StartOffset = ConstantVector::get(Indices);
4721 
4722       Value *GEP = Builder.CreateGEP(
4723           ScStValueType->getPointerElementType(), NewPointerPhi,
4724           Builder.CreateMul(
4725               StartOffset,
4726               Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4727               "vector.gep"));
4728       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4729     }
4730   }
4731   }
4732 }
4733 
4734 /// A helper function for checking whether an integer division-related
4735 /// instruction may divide by zero (in which case it must be predicated if
4736 /// executed conditionally in the scalar code).
4737 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
4739 /// converted into multiplication, so we will still end up scalarizing
4740 /// the division, but can do so w/o predication.
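/// For example, 'udiv i32 %x, 7' has a known non-zero constant divisor and
/// does not need predication for this reason, whereas 'udiv i32 %x, %y' (or a
/// constant-zero divisor) is conservatively treated as possibly dividing by
/// zero.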
4741 static bool mayDivideByZero(Instruction &I) {
4742   assert((I.getOpcode() == Instruction::UDiv ||
4743           I.getOpcode() == Instruction::SDiv ||
4744           I.getOpcode() == Instruction::URem ||
4745           I.getOpcode() == Instruction::SRem) &&
4746          "Unexpected instruction");
4747   Value *Divisor = I.getOperand(1);
4748   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4749   return !CInt || CInt->isZero();
4750 }
4751 
4752 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4753                                            VPUser &User,
4754                                            VPTransformState &State) {
4755   switch (I.getOpcode()) {
4756   case Instruction::Call:
4757   case Instruction::Br:
4758   case Instruction::PHI:
4759   case Instruction::GetElementPtr:
4760   case Instruction::Select:
4761     llvm_unreachable("This instruction is handled by a different recipe.");
4762   case Instruction::UDiv:
4763   case Instruction::SDiv:
4764   case Instruction::SRem:
4765   case Instruction::URem:
4766   case Instruction::Add:
4767   case Instruction::FAdd:
4768   case Instruction::Sub:
4769   case Instruction::FSub:
4770   case Instruction::FNeg:
4771   case Instruction::Mul:
4772   case Instruction::FMul:
4773   case Instruction::FDiv:
4774   case Instruction::FRem:
4775   case Instruction::Shl:
4776   case Instruction::LShr:
4777   case Instruction::AShr:
4778   case Instruction::And:
4779   case Instruction::Or:
4780   case Instruction::Xor: {
4781     // Just widen unops and binops.
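    // For example, with VF=4 a scalar 'add i32' becomes a single
    // 'add <4 x i32>' per unroll part, operating on the already widened
    // operands (illustrative).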
4782     setDebugLocFromInst(Builder, &I);
4783 
4784     for (unsigned Part = 0; Part < UF; ++Part) {
4785       SmallVector<Value *, 2> Ops;
4786       for (VPValue *VPOp : User.operands())
4787         Ops.push_back(State.get(VPOp, Part));
4788 
4789       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4790 
4791       if (auto *VecOp = dyn_cast<Instruction>(V))
4792         VecOp->copyIRFlags(&I);
4793 
4794       // Use this vector value for all users of the original instruction.
4795       State.set(Def, &I, V, Part);
4796       addMetadata(V, &I);
4797     }
4798 
4799     break;
4800   }
4801   case Instruction::ICmp:
4802   case Instruction::FCmp: {
4803     // Widen compares. Generate vector compares.
4804     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4805     auto *Cmp = cast<CmpInst>(&I);
4806     setDebugLocFromInst(Builder, Cmp);
4807     for (unsigned Part = 0; Part < UF; ++Part) {
4808       Value *A = State.get(User.getOperand(0), Part);
4809       Value *B = State.get(User.getOperand(1), Part);
4810       Value *C = nullptr;
4811       if (FCmp) {
4812         // Propagate fast math flags.
4813         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4814         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4815         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4816       } else {
4817         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4818       }
4819       State.set(Def, &I, C, Part);
4820       addMetadata(C, &I);
4821     }
4822 
4823     break;
4824   }
4825 
4826   case Instruction::ZExt:
4827   case Instruction::SExt:
4828   case Instruction::FPToUI:
4829   case Instruction::FPToSI:
4830   case Instruction::FPExt:
4831   case Instruction::PtrToInt:
4832   case Instruction::IntToPtr:
4833   case Instruction::SIToFP:
4834   case Instruction::UIToFP:
4835   case Instruction::Trunc:
4836   case Instruction::FPTrunc:
4837   case Instruction::BitCast: {
4838     auto *CI = cast<CastInst>(&I);
4839     setDebugLocFromInst(Builder, CI);
4840 
4841     /// Vectorize casts.
4842     Type *DestTy =
4843         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
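    // For example, with VF=4 a scalar 'zext i8 to i32' is widened into a
    // 'zext <4 x i8> to <4 x i32>' for each unroll part (illustrative).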
4844 
4845     for (unsigned Part = 0; Part < UF; ++Part) {
4846       Value *A = State.get(User.getOperand(0), Part);
4847       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4848       State.set(Def, &I, Cast, Part);
4849       addMetadata(Cast, &I);
4850     }
4851     break;
4852   }
4853   default:
4854     // This instruction is not vectorized by simple widening.
4855     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4856     llvm_unreachable("Unhandled instruction!");
4857   } // end of switch.
4858 }
4859 
4860 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4861                                                VPUser &ArgOperands,
4862                                                VPTransformState &State) {
4863   assert(!isa<DbgInfoIntrinsic>(I) &&
4864          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4865   setDebugLocFromInst(Builder, &I);
4866 
4867   Module *M = I.getParent()->getParent()->getParent();
4868   auto *CI = cast<CallInst>(&I);
4869 
4870   SmallVector<Type *, 4> Tys;
4871   for (Value *ArgOperand : CI->arg_operands())
4872     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4873 
4874   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4875 
  // The flag indicates whether we use a vector intrinsic or a plain function
  // call for the vectorized version of the instruction, i.e. whether the
  // intrinsic call is at least as cheap as the vector library call.
4879   bool NeedToScalarize = false;
4880   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4881   bool UseVectorIntrinsic =
4882       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4883   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4884          "Instruction should be scalarized elsewhere.");
4885 
4886   for (unsigned Part = 0; Part < UF; ++Part) {
4887     SmallVector<Value *, 4> Args;
4888     for (auto &I : enumerate(ArgOperands.operands())) {
4889       // Some intrinsics have a scalar argument - don't replace it with a
4890       // vector.
4891       Value *Arg;
4892       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4893         Arg = State.get(I.value(), Part);
4894       else
4895         Arg = State.get(I.value(), {0, 0});
4896       Args.push_back(Arg);
4897     }
4898 
4899     Function *VectorF;
4900     if (UseVectorIntrinsic) {
4901       // Use vector version of the intrinsic.
4902       Type *TysForDecl[] = {CI->getType()};
4903       if (VF.isVector()) {
4904         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4905         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4906       }
4907       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4908       assert(VectorF && "Can't retrieve vector intrinsic.");
4909     } else {
4910       // Use vector version of the function call.
4911       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4912 #ifndef NDEBUG
4913       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4914              "Can't create vector function.");
4915 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4917     }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, &I, V, Part);
    addMetadata(V, &I);
4927   }
4928 }
4929 
4930 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
4931                                                  VPUser &Operands,
4932                                                  bool InvariantCond,
4933                                                  VPTransformState &State) {
4934   setDebugLocFromInst(Builder, &I);
4935 
  // The condition can be loop invariant but still defined inside the
4937   // loop. This means that we can't just use the original 'cond' value.
4938   // We have to take the 'vectorized' value and pick the first lane.
4939   // Instcombine will make this a no-op.
4940   auto *InvarCond =
4941       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4942 
4943   for (unsigned Part = 0; Part < UF; ++Part) {
4944     Value *Cond =
4945         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4946     Value *Op0 = State.get(Operands.getOperand(1), Part);
4947     Value *Op1 = State.get(Operands.getOperand(2), Part);
4948     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4949     State.set(VPDef, &I, Sel, Part);
4950     addMetadata(Sel, &I);
4951   }
4952 }
4953 
4954 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4955   // We should not collect Scalars more than once per VF. Right now, this
4956   // function is called from collectUniformsAndScalars(), which already does
4957   // this check. Collecting Scalars for VF=1 does not make any sense.
4958   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4959          "This function should not be visited twice for the same VF");
4960 
4961   SmallSetVector<Instruction *, 8> Worklist;
4962 
4963   // These sets are used to seed the analysis with pointers used by memory
4964   // accesses that will remain scalar.
4965   SmallSetVector<Instruction *, 8> ScalarPtrs;
4966   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4967   auto *Latch = TheLoop->getLoopLatch();
4968 
4969   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4970   // The pointer operands of loads and stores will be scalar as long as the
4971   // memory access is not a gather or scatter operation. The value operand of a
4972   // store will remain scalar if the store is scalarized.
4973   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4974     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4975     assert(WideningDecision != CM_Unknown &&
4976            "Widening decision should be ready at this moment");
4977     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4978       if (Ptr == Store->getValueOperand())
4979         return WideningDecision == CM_Scalarize;
4980     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4981            "Ptr is neither a value or pointer operand");
4982     return WideningDecision != CM_GatherScatter;
4983   };
4984 
4985   // A helper that returns true if the given value is a bitcast or
4986   // getelementptr instruction contained in the loop.
4987   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4988     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4989             isa<GetElementPtrInst>(V)) &&
4990            !TheLoop->isLoopInvariant(V);
4991   };
4992 
4993   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4994     if (!isa<PHINode>(Ptr) ||
4995         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4996       return false;
4997     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4998     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4999       return false;
5000     return isScalarUse(MemAccess, Ptr);
5001   };
5002 
  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is actually a pointer induction of a loop, it is inserted into
  // the Worklist. If the use will be a scalar use, and the
5006   // pointer is only used by memory accesses, we place the pointer in
5007   // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
5008   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5009     if (isScalarPtrInduction(MemAccess, Ptr)) {
5010       Worklist.insert(cast<Instruction>(Ptr));
5011       Instruction *Update = cast<Instruction>(
5012           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5013       Worklist.insert(Update);
5014       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5015                         << "\n");
5016       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
5017                         << "\n");
5018       return;
5019     }
5020     // We only care about bitcast and getelementptr instructions contained in
5021     // the loop.
5022     if (!isLoopVaryingBitCastOrGEP(Ptr))
5023       return;
5024 
5025     // If the pointer has already been identified as scalar (e.g., if it was
5026     // also identified as uniform), there's nothing to do.
5027     auto *I = cast<Instruction>(Ptr);
5028     if (Worklist.count(I))
5029       return;
5030 
5031     // If the use of the pointer will be a scalar use, and all users of the
5032     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5033     // place the pointer in PossibleNonScalarPtrs.
5034     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5035           return isa<LoadInst>(U) || isa<StoreInst>(U);
5036         }))
5037       ScalarPtrs.insert(I);
5038     else
5039       PossibleNonScalarPtrs.insert(I);
5040   };
5041 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
5046   //
5047   // (1) Add to the worklist all instructions that have been identified as
5048   // uniform-after-vectorization.
5049   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5050 
5051   // (2) Add to the worklist all bitcast and getelementptr instructions used by
5052   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
5054   // scatter operation. The value operand of a store will remain scalar if the
5055   // store is scalarized.
5056   for (auto *BB : TheLoop->blocks())
5057     for (auto &I : *BB) {
5058       if (auto *Load = dyn_cast<LoadInst>(&I)) {
5059         evaluatePtrUse(Load, Load->getPointerOperand());
5060       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5061         evaluatePtrUse(Store, Store->getPointerOperand());
5062         evaluatePtrUse(Store, Store->getValueOperand());
5063       }
5064     }
5065   for (auto *I : ScalarPtrs)
5066     if (!PossibleNonScalarPtrs.count(I)) {
5067       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5068       Worklist.insert(I);
5069     }
5070 
5071   // Insert the forced scalars.
5072   // FIXME: Currently widenPHIInstruction() often creates a dead vector
5073   // induction variable when the PHI user is scalarized.
5074   auto ForcedScalar = ForcedScalars.find(VF);
5075   if (ForcedScalar != ForcedScalars.end())
5076     for (auto *I : ForcedScalar->second)
5077       Worklist.insert(I);
5078 
5079   // Expand the worklist by looking through any bitcasts and getelementptr
5080   // instructions we've already identified as scalar. This is similar to the
5081   // expansion step in collectLoopUniforms(); however, here we're only
5082   // expanding to include additional bitcasts and getelementptr instructions.
5083   unsigned Idx = 0;
5084   while (Idx != Worklist.size()) {
5085     Instruction *Dst = Worklist[Idx++];
5086     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5087       continue;
5088     auto *Src = cast<Instruction>(Dst->getOperand(0));
5089     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5090           auto *J = cast<Instruction>(U);
5091           return !TheLoop->contains(J) || Worklist.count(J) ||
5092                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5093                   isScalarUse(J, Src));
5094         })) {
5095       Worklist.insert(Src);
5096       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5097     }
5098   }
5099 
5100   // An induction variable will remain scalar if all users of the induction
5101   // variable and induction variable update remain scalar.
5102   for (auto &Induction : Legal->getInductionVars()) {
5103     auto *Ind = Induction.first;
5104     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5105 
5106     // If tail-folding is applied, the primary induction variable will be used
5107     // to feed a vector compare.
5108     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5109       continue;
5110 
5111     // Determine if all users of the induction variable are scalar after
5112     // vectorization.
5113     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5114       auto *I = cast<Instruction>(U);
5115       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5116     });
5117     if (!ScalarInd)
5118       continue;
5119 
5120     // Determine if all users of the induction variable update instruction are
5121     // scalar after vectorization.
5122     auto ScalarIndUpdate =
5123         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5124           auto *I = cast<Instruction>(U);
5125           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5126         });
5127     if (!ScalarIndUpdate)
5128       continue;
5129 
5130     // The induction variable and its update instruction will remain scalar.
5131     Worklist.insert(Ind);
5132     Worklist.insert(IndUpdate);
5133     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5134     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5135                       << "\n");
5136   }
5137 
5138   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5139 }
5140 
5141 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
5142                                                          ElementCount VF) {
5143   if (!blockNeedsPredication(I->getParent()))
5144     return false;
5145   switch(I->getOpcode()) {
5146   default:
5147     break;
5148   case Instruction::Load:
5149   case Instruction::Store: {
5150     if (!Legal->isMaskRequired(I))
5151       return false;
5152     auto *Ptr = getLoadStorePointerOperand(I);
5153     auto *Ty = getMemInstValueType(I);
5154     // We have already decided how to vectorize this instruction, get that
5155     // result.
5156     if (VF.isVector()) {
5157       InstWidening WideningDecision = getWideningDecision(I, VF);
5158       assert(WideningDecision != CM_Unknown &&
5159              "Widening decision should be ready at this moment");
5160       return WideningDecision == CM_Scalarize;
5161     }
5162     const Align Alignment = getLoadStoreAlignment(I);
5163     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5164                                 isLegalMaskedGather(Ty, Alignment))
5165                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5166                                 isLegalMaskedScatter(Ty, Alignment));
5167   }
5168   case Instruction::UDiv:
5169   case Instruction::SDiv:
5170   case Instruction::SRem:
5171   case Instruction::URem:
5172     return mayDivideByZero(*I);
5173   }
5174   return false;
5175 }
5176 
5177 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5178     Instruction *I, ElementCount VF) {
5179   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5180   assert(getWideningDecision(I, VF) == CM_Unknown &&
5181          "Decision should not be set yet.");
5182   auto *Group = getInterleavedAccessGroup(I);
5183   assert(Group && "Must have a group.");
5184 
  // If the instruction's allocated size doesn't equal its type size, it
5186   // requires padding and will be scalarized.
5187   auto &DL = I->getModule()->getDataLayout();
5188   auto *ScalarTy = getMemInstValueType(I);
5189   if (hasIrregularType(ScalarTy, DL, VF))
5190     return false;
5191 
5192   // Check if masking is required.
5193   // A Group may need masking for one of two reasons: it resides in a block that
5194   // needs predication, or it was decided to use masking to deal with gaps.
5195   bool PredicatedAccessRequiresMasking =
5196       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5197   bool AccessWithGapsRequiresMasking =
5198       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5199   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5200     return true;
5201 
5202   // If masked interleaving is required, we expect that the user/target had
5203   // enabled it, because otherwise it either wouldn't have been created or
5204   // it should have been invalidated by the CostModel.
5205   assert(useMaskedInterleavedAccesses(TTI) &&
5206          "Masked interleave-groups for predicated accesses are not enabled.");
5207 
5208   auto *Ty = getMemInstValueType(I);
5209   const Align Alignment = getLoadStoreAlignment(I);
5210   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5211                           : TTI.isLegalMaskedStore(Ty, Alignment);
5212 }
5213 
5214 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5215     Instruction *I, ElementCount VF) {
5216   // Get and ensure we have a valid memory instruction.
5217   LoadInst *LI = dyn_cast<LoadInst>(I);
5218   StoreInst *SI = dyn_cast<StoreInst>(I);
5219   assert((LI || SI) && "Invalid memory instruction");
5220 
5221   auto *Ptr = getLoadStorePointerOperand(I);
5222 
5223   // In order to be widened, the pointer should be consecutive, first of all.
5224   if (!Legal->isConsecutivePtr(Ptr))
5225     return false;
5226 
5227   // If the instruction is a store located in a predicated block, it will be
5228   // scalarized.
5229   if (isScalarWithPredication(I))
5230     return false;
5231 
  // If the instruction's allocated size doesn't equal its type size, it
5233   // requires padding and will be scalarized.
5234   auto &DL = I->getModule()->getDataLayout();
5235   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5236   if (hasIrregularType(ScalarTy, DL, VF))
5237     return false;
5238 
5239   return true;
5240 }
5241 
5242 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5243   // We should not collect Uniforms more than once per VF. Right now,
5244   // this function is called from collectUniformsAndScalars(), which
5245   // already does this check. Collecting Uniforms for VF=1 does not make any
5246   // sense.
5247 
5248   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5249          "This function should not be visited twice for the same VF");
5250 
  // Visit the list of Uniforms. If we do not find any uniform value, we will
  // not analyze it again; Uniforms.count(VF) will still return 1.
5253   Uniforms[VF].clear();
5254 
5255   // We now know that the loop is vectorizable!
5256   // Collect instructions inside the loop that will remain uniform after
5257   // vectorization.
5258 
  // Global values, params and instructions outside of the current loop are
  // out of scope.
5261   auto isOutOfScope = [&](Value *V) -> bool {
5262     Instruction *I = dyn_cast<Instruction>(V);
5263     return (!I || !TheLoop->contains(I));
5264   };
5265 
5266   SetVector<Instruction *> Worklist;
5267   BasicBlock *Latch = TheLoop->getLoopLatch();
5268 
5269   // Instructions that are scalar with predication must not be considered
5270   // uniform after vectorization, because that would create an erroneous
5271   // replicating region where only a single instance out of VF should be formed.
5272   // TODO: optimize such seldom cases if found important, see PR40816.
5273   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5274     if (isOutOfScope(I)) {
5275       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5276                         << *I << "\n");
5277       return;
5278     }
5279     if (isScalarWithPredication(I, VF)) {
5280       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5281                         << *I << "\n");
5282       return;
5283     }
5284     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5285     Worklist.insert(I);
5286   };
5287 
5288   // Start with the conditional branch. If the branch condition is an
5289   // instruction contained in the loop that is only used by the branch, it is
5290   // uniform.
5291   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5292   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5293     addToWorklistIfAllowed(Cmp);
5294 
5295   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5296     InstWidening WideningDecision = getWideningDecision(I, VF);
5297     assert(WideningDecision != CM_Unknown &&
5298            "Widening decision should be ready at this moment");
5299 
5300     // A uniform memory op is itself uniform.  We exclude uniform stores
5301     // here as they demand the last lane, not the first one.
5302     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5303       assert(WideningDecision == CM_Scalarize);
5304       return true;
5305     }
5306 
5307     return (WideningDecision == CM_Widen ||
5308             WideningDecision == CM_Widen_Reverse ||
5309             WideningDecision == CM_Interleave);
5310   };
5311 
5312 
5313   // Returns true if Ptr is the pointer operand of a memory access instruction
5314   // I, and I is known to not require scalarization.
5315   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5316     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5317   };
5318 
5319   // Holds a list of values which are known to have at least one uniform use.
5320   // Note that there may be other uses which aren't uniform.  A "uniform use"
5321   // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (i.e. this is not
  // the usual meaning of uniform).
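  // For example, the pointer operand of a consecutive, widened load only
  // demands lane 0; the addresses of the remaining lanes are implied by the
  // access itself.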
5324   SmallPtrSet<Value *, 8> HasUniformUse;
5325 
5326   // Scan the loop for instructions which are either a) known to have only
5327   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5328   for (auto *BB : TheLoop->blocks())
5329     for (auto &I : *BB) {
5330       // If there's no pointer operand, there's nothing to do.
5331       auto *Ptr = getLoadStorePointerOperand(&I);
5332       if (!Ptr)
5333         continue;
5334 
5335       // A uniform memory op is itself uniform.  We exclude uniform stores
5336       // here as they demand the last lane, not the first one.
5337       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5338         addToWorklistIfAllowed(&I);
5339 
5340       if (isUniformDecision(&I, VF)) {
5341         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5342         HasUniformUse.insert(Ptr);
5343       }
5344     }
5345 
5346   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5347   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5348   // disallows uses outside the loop as well.
5349   for (auto *V : HasUniformUse) {
5350     if (isOutOfScope(V))
5351       continue;
5352     auto *I = cast<Instruction>(V);
5353     auto UsersAreMemAccesses =
5354       llvm::all_of(I->users(), [&](User *U) -> bool {
5355         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5356       });
5357     if (UsersAreMemAccesses)
5358       addToWorklistIfAllowed(I);
5359   }
5360 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures that
  // a uniform instruction will only be used by uniform instructions.
5364   unsigned idx = 0;
5365   while (idx != Worklist.size()) {
5366     Instruction *I = Worklist[idx++];
5367 
5368     for (auto OV : I->operand_values()) {
5369       // isOutOfScope operands cannot be uniform instructions.
5370       if (isOutOfScope(OV))
5371         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
5374       auto *OP = dyn_cast<PHINode>(OV);
5375       if (OP && Legal->isFirstOrderRecurrence(OP))
5376         continue;
5377       // If all the users of the operand are uniform, then add the
5378       // operand into the uniform worklist.
5379       auto *OI = cast<Instruction>(OV);
5380       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5381             auto *J = cast<Instruction>(U);
5382             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5383           }))
5384         addToWorklistIfAllowed(OI);
5385     }
5386   }
5387 
5388   // For an instruction to be added into Worklist above, all its users inside
5389   // the loop should also be in Worklist. However, this condition cannot be
5390   // true for phi nodes that form a cyclic dependence. We must process phi
5391   // nodes separately. An induction variable will remain uniform if all users
5392   // of the induction variable and induction variable update remain uniform.
5393   // The code below handles both pointer and non-pointer induction variables.
5394   for (auto &Induction : Legal->getInductionVars()) {
5395     auto *Ind = Induction.first;
5396     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5397 
5398     // Determine if all users of the induction variable are uniform after
5399     // vectorization.
5400     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5401       auto *I = cast<Instruction>(U);
5402       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5403              isVectorizedMemAccessUse(I, Ind);
5404     });
5405     if (!UniformInd)
5406       continue;
5407 
5408     // Determine if all users of the induction variable update instruction are
5409     // uniform after vectorization.
5410     auto UniformIndUpdate =
5411         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5412           auto *I = cast<Instruction>(U);
5413           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5414                  isVectorizedMemAccessUse(I, IndUpdate);
5415         });
5416     if (!UniformIndUpdate)
5417       continue;
5418 
5419     // The induction variable and its update instruction will remain uniform.
5420     addToWorklistIfAllowed(Ind);
5421     addToWorklistIfAllowed(IndUpdate);
5422   }
5423 
5424   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5425 }
5426 
5427 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5428   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5429 
5430   if (Legal->getRuntimePointerChecking()->Need) {
5431     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5432         "runtime pointer checks needed. Enable vectorization of this "
5433         "loop with '#pragma clang loop vectorize(enable)' when "
5434         "compiling with -Os/-Oz",
5435         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5436     return true;
5437   }
5438 
5439   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5440     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5441         "runtime SCEV checks needed. Enable vectorization of this "
5442         "loop with '#pragma clang loop vectorize(enable)' when "
5443         "compiling with -Os/-Oz",
5444         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5445     return true;
5446   }
5447 
5448   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5449   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5450     reportVectorizationFailure("Runtime stride check for small trip count",
5451         "runtime stride == 1 checks needed. Enable vectorization of "
5452         "this loop without such check by compiling with -Os/-Oz",
5453         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5454     return true;
5455   }
5456 
5457   return false;
5458 }
5459 
5460 Optional<ElementCount>
5461 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5462   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to be
    // dynamically uniform if the target can skip it.
5465     reportVectorizationFailure(
5466         "Not inserting runtime ptr check for divergent target",
5467         "runtime pointer checks needed. Not enabled for divergent target",
5468         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5469     return None;
5470   }
5471 
5472   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5473   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5474   if (TC == 1) {
5475     reportVectorizationFailure("Single iteration (non) loop",
5476         "loop trip count is one, irrelevant for vectorization",
5477         "SingleIterationLoop", ORE, TheLoop);
5478     return None;
5479   }
5480 
5481   ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5482 
5483   switch (ScalarEpilogueStatus) {
5484   case CM_ScalarEpilogueAllowed:
5485     return MaxVF;
5486   case CM_ScalarEpilogueNotAllowedUsePredicate:
5487     LLVM_FALLTHROUGH;
5488   case CM_ScalarEpilogueNotNeededUsePredicate:
5489     LLVM_DEBUG(
5490         dbgs() << "LV: vector predicate hint/switch found.\n"
5491                << "LV: Not allowing scalar epilogue, creating predicated "
5492                << "vector loop.\n");
5493     break;
5494   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5495     // fallthrough as a special case of OptForSize
5496   case CM_ScalarEpilogueNotAllowedOptSize:
5497     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5498       LLVM_DEBUG(
5499           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5500     else
5501       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5502                         << "count.\n");
5503 
5504     // Bail if runtime checks are required, which are not good when optimising
5505     // for size.
5506     if (runtimeChecksRequired())
5507       return None;
5508 
5509     break;
5510   }
5511 
5512   // The only loops we can vectorize without a scalar epilogue, are loops with
5513   // a bottom-test and a single exiting block. We'd have to handle the fact
5514   // that not every instruction executes on the last iteration.  This will
5515   // require a lane mask which varies through the vector loop body.  (TODO)
5516   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5517     // If there was a tail-folding hint/switch, but we can't fold the tail by
5518     // masking, fallback to a vectorization with a scalar epilogue.
5519     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5520       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5521                            "scalar epilogue instead.\n");
5522       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5523       return MaxVF;
5524     }
5525     return None;
5526   }
5527 
  // Now try to fold the tail by masking.
5529 
5530   // Invalidate interleave groups that require an epilogue if we can't mask
5531   // the interleave-group.
5532   if (!useMaskedInterleavedAccesses(TTI)) {
5533     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5534            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
5537     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5538   }
5539 
5540   assert(!MaxVF.isScalable() &&
5541          "Scalable vectors do not yet support tail folding");
5542   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5543          "MaxVF must be a power of 2");
5544   unsigned MaxVFtimesIC =
5545       UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5546   // Avoid tail folding if the trip count is known to be a multiple of any VF we
5547   // chose.
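  // For example, a known trip count of 64 with MaxVF=8 and UserIC=2 gives
  // 64 % 16 == 0, so no tail remains (illustrative).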
5548   ScalarEvolution *SE = PSE.getSE();
5549   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5550   const SCEV *ExitCount = SE->getAddExpr(
5551       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5552   const SCEV *Rem = SE->getURemExpr(
5553       ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5554   if (Rem->isZero()) {
5555     // Accept MaxVF if we do not have a tail.
5556     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5557     return MaxVF;
5558   }
5559 
5560   // If we don't know the precise trip count, or if the trip count that we
5561   // found modulo the vectorization factor is not zero, try to fold the tail
5562   // by masking.
5563   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5564   if (Legal->prepareToFoldTailByMasking()) {
5565     FoldTailByMasking = true;
5566     return MaxVF;
5567   }
5568 
5569   // If there was a tail-folding hint/switch, but we can't fold the tail by
5570   // masking, fallback to a vectorization with a scalar epilogue.
5571   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5572     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5573                          "scalar epilogue instead.\n");
5574     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5575     return MaxVF;
5576   }
5577 
5578   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5579     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5580     return None;
5581   }
5582 
5583   if (TC == 0) {
5584     reportVectorizationFailure(
5585         "Unable to calculate the loop count due to complex control flow",
5586         "unable to calculate the loop count due to complex control flow",
5587         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5588     return None;
5589   }
5590 
5591   reportVectorizationFailure(
5592       "Cannot optimize for size and vectorize at the same time.",
5593       "cannot optimize for size and vectorize at the same time. "
5594       "Enable vectorization of this loop with '#pragma clang loop "
5595       "vectorize(enable)' when compiling with -Os/-Oz",
5596       "NoTailLoopWithOptForSize", ORE, TheLoop);
5597   return None;
5598 }
5599 
5600 ElementCount
5601 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5602                                                  ElementCount UserVF) {
5603   bool IgnoreScalableUserVF = UserVF.isScalable() &&
5604                               !TTI.supportsScalableVectors() &&
5605                               !ForceTargetSupportsScalableVectors;
5606   if (IgnoreScalableUserVF) {
5607     LLVM_DEBUG(
5608         dbgs() << "LV: Ignoring VF=" << UserVF
5609                << " because target does not support scalable vectors.\n");
5610     ORE->emit([&]() {
5611       return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF",
5612                                         TheLoop->getStartLoc(),
5613                                         TheLoop->getHeader())
5614              << "Ignoring VF=" << ore::NV("UserVF", UserVF)
5615              << " because target does not support scalable vectors.";
5616     });
5617   }
5618 
5619   // Beyond this point two scenarios are handled. If UserVF isn't specified
5620   // then a suitable VF is chosen. If UserVF is specified and there are
5621   // dependencies, check if it's legal. However, if a UserVF is specified and
5622   // there are no dependencies, then there's nothing to do.
5623   if (UserVF.isNonZero() && !IgnoreScalableUserVF &&
5624       Legal->isSafeForAnyVectorWidth())
5625     return UserVF;
5626 
5627   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5628   unsigned SmallestType, WidestType;
5629   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5630   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5631 
5632   // Get the maximum safe dependence distance in bits computed by LAA.
5633   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
5635   // dependence distance).
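  // For example, a maximum safe width of 128 bits with a widest type of
  // 32 bits permits at most 4 elements per vector (illustrative).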
5636   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
5637 
5638   // If the user vectorization factor is legally unsafe, clamp it to a safe
5639   // value. Otherwise, return as is.
5640   if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
5641     unsigned MaxSafeElements =
5642         PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
5643     ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
5644 
5645     if (UserVF.isScalable()) {
5646       Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5647 
5648       // Scale VF by vscale before checking if it's safe.
5649       MaxSafeVF = ElementCount::getScalable(
5650           MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
5651 
5652       if (MaxSafeVF.isZero()) {
5653         // The dependence distance is too small to use scalable vectors,
5654         // fallback on fixed.
5655         LLVM_DEBUG(
5656             dbgs()
5657             << "LV: Max legal vector width too small, scalable vectorization "
5658                "unfeasible. Using fixed-width vectorization instead.\n");
5659         ORE->emit([&]() {
5660           return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
5661                                             TheLoop->getStartLoc(),
5662                                             TheLoop->getHeader())
5663                  << "Max legal vector width too small, scalable vectorization "
5664                  << "unfeasible. Using fixed-width vectorization instead.";
5665         });
5666         return computeFeasibleMaxVF(
5667             ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
5668       }
5669     }
5670 
5671     LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
5672 
5673     if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
5674       return UserVF;
5675 
5676     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5677                       << " is unsafe, clamping to max safe VF=" << MaxSafeVF
5678                       << ".\n");
5679     ORE->emit([&]() {
5680       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5681                                         TheLoop->getStartLoc(),
5682                                         TheLoop->getHeader())
5683              << "User-specified vectorization factor "
5684              << ore::NV("UserVectorizationFactor", UserVF)
5685              << " is unsafe, clamping to maximum safe vectorization factor "
5686              << ore::NV("VectorizationFactor", MaxSafeVF);
5687     });
5688     return MaxSafeVF;
5689   }
5690 
5691   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
5692 
5693   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
5695   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
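  // For example, a 256-bit widest register and a widest type of 32 bits give
  // MaxVectorSize = 8 (illustrative).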
5696 
5697   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5698                     << " / " << WidestType << " bits.\n");
5699   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5700                     << WidestRegister << " bits.\n");
5701 
5702   assert(MaxVectorSize <= WidestRegister &&
5703          "Did not expect to pack so many elements"
5704          " into one vector!");
5705   if (MaxVectorSize == 0) {
5706     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5707     MaxVectorSize = 1;
5708     return ElementCount::getFixed(MaxVectorSize);
5709   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5710              isPowerOf2_32(ConstTripCount)) {
5711     // We need to clamp the VF to be the ConstTripCount. There is no point in
5712     // choosing a higher viable VF as done in the loop below.
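    // For example, a constant trip count of 4 with MaxVectorSize 8 clamps the
    // VF to 4, since a wider vector could never be filled (illustrative).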
5713     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5714                       << ConstTripCount << "\n");
5715     MaxVectorSize = ConstTripCount;
5716     return ElementCount::getFixed(MaxVectorSize);
5717   }
5718 
5719   unsigned MaxVF = MaxVectorSize;
5720   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5721       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5722     // Collect all viable vectorization factors larger than the default MaxVF
5723     // (i.e. MaxVectorSize).
5724     SmallVector<ElementCount, 8> VFs;
5725     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5726     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5727       VFs.push_back(ElementCount::getFixed(VS));
5728 
5729     // For each VF calculate its register usage.
5730     auto RUs = calculateRegisterUsage(VFs);
5731 
5732     // Select the largest VF which doesn't require more registers than existing
5733     // ones.
5734     for (int i = RUs.size() - 1; i >= 0; --i) {
5735       bool Selected = true;
5736       for (auto& pair : RUs[i].MaxLocalUsers) {
5737         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5738         if (pair.second > TargetNumRegisters)
5739           Selected = false;
5740       }
5741       if (Selected) {
5742         MaxVF = VFs[i].getKnownMinValue();
5743         break;
5744       }
5745     }
5746     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5747       if (MaxVF < MinVF) {
5748         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5749                           << ") with target's minimum: " << MinVF << '\n');
5750         MaxVF = MinVF;
5751       }
5752     }
5753   }
5754   return ElementCount::getFixed(MaxVF);
5755 }
5756 
5757 VectorizationFactor
5758 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
5759   // FIXME: This can be fixed for scalable vectors later, because at this stage
5760   // the LoopVectorizer will only consider vectorizing a loop with scalable
5761   // vectors when the loop has a hint to enable vectorization for a given VF.
5762   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
5763 
5764   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5765   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5766   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5767 
5768   unsigned Width = 1;
5769   const float ScalarCost = *ExpectedCost.getValue();
5770   float Cost = ScalarCost;
5771 
5772   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5773   if (ForceVectorization && MaxVF.isVector()) {
5774     // Ignore scalar width, because the user explicitly wants vectorization.
5775     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5776     // evaluation.
5777     Cost = std::numeric_limits<float>::max();
5778   }
5779 
5780   for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
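    // For example, a vector loop of width 4 with a total cost of 20 has a
    // per-element cost of 5, which is what gets compared against the scalar
    // loop cost (illustrative).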
5784     VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5785     assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
5786     float VectorCost = *C.first.getValue() / (float)i;
5787     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5788                       << " costs: " << (int)VectorCost << ".\n");
5789     if (!C.second && !ForceVectorization) {
5790       LLVM_DEBUG(
5791           dbgs() << "LV: Not considering vector loop of width " << i
5792                  << " because it will not generate any vector instructions.\n");
5793       continue;
5794     }
5795 
5796     // If profitable add it to ProfitableVF list.
5797     if (VectorCost < ScalarCost) {
5798       ProfitableVFs.push_back(VectorizationFactor(
5799           {ElementCount::getFixed(i), (unsigned)VectorCost}));
5800     }
5801 
5802     if (VectorCost < Cost) {
5803       Cost = VectorCost;
5804       Width = i;
5805     }
5806   }
5807 
5808   if (!EnableCondStoresVectorization && NumPredStores) {
5809     reportVectorizationFailure("There are conditional stores.",
5810         "store that is conditionally executed prevents vectorization",
5811         "ConditionalStore", ORE, TheLoop);
5812     Width = 1;
5813     Cost = ScalarCost;
5814   }
5815 
5816   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5817              << "LV: Vectorization seems to be not beneficial, "
5818              << "but was forced by a user.\n");
5819   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5820   VectorizationFactor Factor = {ElementCount::getFixed(Width),
5821                                 (unsigned)(Width * Cost)};
5822   return Factor;
5823 }
5824 
5825 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5826     const Loop &L, ElementCount VF) const {
5827   // Cross iteration phis such as reductions need special handling and are
5828   // currently unsupported.
5829   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5830         return Legal->isFirstOrderRecurrence(&Phi) ||
5831                Legal->isReductionVariable(&Phi);
5832       }))
5833     return false;
5834 
5835   // Phis with uses outside of the loop require special handling and are
5836   // currently unsupported.
5837   for (auto &Entry : Legal->getInductionVars()) {
5838     // Look for uses of the value of the induction at the last iteration.
5839     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5840     for (User *U : PostInc->users())
5841       if (!L.contains(cast<Instruction>(U)))
5842         return false;
5843     // Look for uses of penultimate value of the induction.
5844     for (User *U : Entry.first->users())
5845       if (!L.contains(cast<Instruction>(U)))
5846         return false;
5847   }
5848 
5849   // Induction variables that are widened require special handling that is
5850   // currently not supported.
5851   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5852         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5853                  this->isProfitableToScalarize(Entry.first, VF));
5854       }))
5855     return false;
5856 
5857   return true;
5858 }
5859 
5860 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5861     const ElementCount VF) const {
5862   // FIXME: We need a much better cost-model to take different parameters such
5863   // as register pressure, code size increase and cost of extra branches into
5864   // account. For now we apply a very crude heuristic and only consider loops
5865   // with vectorization factors larger than a certain value.
5866   // We also consider epilogue vectorization unprofitable for targets that don't
5867   // consider interleaving beneficial (eg. MVE).
5868   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5869     return false;
5870   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5871     return true;
5872   return false;
5873 }
5874 
5875 VectorizationFactor
5876 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5877     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5878   VectorizationFactor Result = VectorizationFactor::Disabled();
5879   if (!EnableEpilogueVectorization) {
5880     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5881     return Result;
5882   }
5883 
5884   if (!isScalarEpilogueAllowed()) {
5885     LLVM_DEBUG(
5886         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5887                   "allowed.\n";);
5888     return Result;
5889   }
5890 
5891   // FIXME: This can be fixed for scalable vectors later, because at this stage
5892   // the LoopVectorizer will only consider vectorizing a loop with scalable
5893   // vectors when the loop has a hint to enable vectorization for a given VF.
5894   if (MainLoopVF.isScalable()) {
5895     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
5896                          "yet supported.\n");
5897     return Result;
5898   }
5899 
5900   // Not really a cost consideration, but check for unsupported cases here to
5901   // simplify the logic.
5902   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5903     LLVM_DEBUG(
5904         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5905                   "not a supported candidate.\n";);
5906     return Result;
5907   }
5908 
5909   if (EpilogueVectorizationForceVF > 1) {
5910     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5911     if (LVP.hasPlanWithVFs(
5912             {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
5913       return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
5914     else {
5915       LLVM_DEBUG(
5916           dbgs()
5917               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5918       return Result;
5919     }
5920   }
5921 
5922   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5923       TheLoop->getHeader()->getParent()->hasMinSize()) {
5924     LLVM_DEBUG(
5925         dbgs()
5926             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5927     return Result;
5928   }
5929 
5930   if (!isEpilogueVectorizationProfitable(MainLoopVF))
5931     return Result;
5932 
5933   for (auto &NextVF : ProfitableVFs)
5934     if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
5935         (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
5936         LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
5937       Result = NextVF;
5938 
5939   if (Result != VectorizationFactor::Disabled())
5940     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5941                       << Result.Width.getFixedValue() << "\n";);
5942   return Result;
5943 }
5944 
5945 std::pair<unsigned, unsigned>
5946 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5947   unsigned MinWidth = -1U;
5948   unsigned MaxWidth = 8;
5949   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5950 
5951   // For each block.
5952   for (BasicBlock *BB : TheLoop->blocks()) {
5953     // For each instruction in the loop.
5954     for (Instruction &I : BB->instructionsWithoutDebug()) {
5955       Type *T = I.getType();
5956 
5957       // Skip ignored values.
5958       if (ValuesToIgnore.count(&I))
5959         continue;
5960 
5961       // Only examine Loads, Stores and PHINodes.
5962       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5963         continue;
5964 
5965       // Examine PHI nodes that are reduction variables. Update the type to
5966       // account for the recurrence type.
5967       if (auto *PN = dyn_cast<PHINode>(&I)) {
5968         if (!Legal->isReductionVariable(PN))
5969           continue;
5970         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5971         T = RdxDesc.getRecurrenceType();
5972       }
5973 
5974       // Examine the stored values.
5975       if (auto *ST = dyn_cast<StoreInst>(&I))
5976         T = ST->getValueOperand()->getType();
5977 
5978       // Ignore loaded pointer types and stored pointer types that are not
5979       // vectorizable.
5980       //
5981       // FIXME: The check here attempts to predict whether a load or store will
5982       //        be vectorized. We only know this for certain after a VF has
5983       //        been selected. Here, we assume that if an access can be
5984       //        vectorized, it will be. We should also look at extending this
5985       //        optimization to non-pointer types.
5986       //
5987       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5988           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5989         continue;
5990 
5991       MinWidth = std::min(MinWidth,
5992                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5993       MaxWidth = std::max(MaxWidth,
5994                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5995     }
5996   }
5997 
5998   return {MinWidth, MaxWidth};
5999 }
6000 
6001 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
6002                                                            unsigned LoopCost) {
6003   // -- The interleave heuristics --
6004   // We interleave the loop in order to expose ILP and reduce the loop overhead.
6005   // There are many micro-architectural considerations that we can't predict
6006   // at this level. For example, frontend pressure (on decode or fetch) due to
6007   // code size, or the number and capabilities of the execution ports.
6008   //
6009   // We use the following heuristics to select the interleave count:
6010   // 1. If the code has reductions, then we interleave to break the cross
6011   // iteration dependency.
6012   // 2. If the loop is really small, then we interleave to reduce the loop
6013   // overhead.
6014   // 3. We don't interleave if we think that we will spill registers to memory
6015   // due to the increased register pressure.
6016 
6017   if (!isScalarEpilogueAllowed())
6018     return 1;
6019 
  // If there is a known maximum safe dependence distance, it was already used
  // to limit the vectorization factor; do not interleave on top of that.
6021   if (Legal->getMaxSafeDepDistBytes() != -1U)
6022     return 1;
6023 
6024   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6025   const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. However, we will still interleave when
  // InterleaveSmallLoopScalarReduction is enabled and the code has scalar
  // reductions (HasReductions && VF == 1), because in that case interleaving
  // can expose ILP and break the cross-iteration dependences of the
  // reductions.
6031   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6032       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6033     return 1;
6034 
6035   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these counts below, so make sure each register class has at
  // least one instruction using at least one register; clamp to 1 to avoid
  // dividing by zero.
6038   for (auto& pair : R.MaxLocalUsers) {
6039     pair.second = std::max(pair.second, 1U);
6040   }
6041 
6042   // We calculate the interleave count using the following formula.
6043   // Subtract the number of loop invariants from the number of available
6044   // registers. These registers are used by all of the interleaved instances.
6045   // Next, divide the remaining registers by the number of registers that is
6046   // required by the loop, in order to estimate how many parallel instances
6047   // fit without causing spills. All of this is rounded down if necessary to be
6048   // a power of two. We want power of two interleave count to simplify any
6049   // addressing operations or alignment considerations.
6050   // We also want power of two interleave counts to ensure that the induction
6051   // variable of the vector loop wraps to zero, when tail is folded by masking;
6052   // this currently happens when OptForSize, in which case IC is set to 1 above.
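  // As an illustrative sketch (numbers are not from any particular target):
  // with 32 registers in a class, 2 of them loop invariant and at most 6
  // values live at once, the base estimate is
  // PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5) = 4 interleaved instances,
  // before the induction-variable adjustment below.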
6053   unsigned IC = UINT_MAX;
6054 
6055   for (auto& pair : R.MaxLocalUsers) {
6056     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
6057     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6058                       << " registers of "
6059                       << TTI.getRegisterClassName(pair.first) << " register class\n");
6060     if (VF.isScalar()) {
6061       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6062         TargetNumRegisters = ForceTargetNumScalarRegs;
6063     } else {
6064       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6065         TargetNumRegisters = ForceTargetNumVectorRegs;
6066     }
6067     unsigned MaxLocalUsers = pair.second;
6068     unsigned LoopInvariantRegs = 0;
6069     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6070       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6071 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
6073     // Don't count the induction variable as interleaved.
6074     if (EnableIndVarRegisterHeur) {
6075       TmpIC =
6076           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6077                         std::max(1U, (MaxLocalUsers - 1)));
6078     }
6079 
6080     IC = std::min(IC, TmpIC);
6081   }
6082 
6083   // Clamp the interleave ranges to reasonable counts.
6084   unsigned MaxInterleaveCount =
6085       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6086 
6087   // Check if the user has overridden the max.
6088   if (VF.isScalar()) {
6089     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6090       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6091   } else {
6092     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6093       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6094   }
6095 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to be less than the trip count divided by VF, provided it
  // is at least 1.
  //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second
  // vector iteration is enabled. However, for larger loops, there is likely to
  // be a similar benefit as for fixed-width vectors. For now, we choose to
  // leave the interleave count as if vscale were '1', although if some
  // information about the vector is known (e.g. min vector size), we can make
  // a better decision.
6106   if (BestKnownTC) {
6107     MaxInterleaveCount =
6108         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
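    // For example (illustrative), a known trip count of 24 with a VF whose
    // known minimum is 8 limits MaxInterleaveCount to at most 24 / 8 = 3.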
6109     // Make sure MaxInterleaveCount is greater than 0.
6110     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6111   }
6112 
6113   assert(MaxInterleaveCount > 0 &&
6114          "Maximum interleave count must be greater than 0");
6115 
  // Clamp the calculated IC to be between 1 and the maximum interleave count
  // that the target and trip count allow.
6118   if (IC > MaxInterleaveCount)
6119     IC = MaxInterleaveCount;
6120   else
6121     // Make sure IC is greater than 0.
6122     IC = std::max(1u, IC);
6123 
6124   assert(IC > 0 && "Interleave count must be greater than 0.");
6125 
6126   // If we did not calculate the cost for VF (because the user selected the VF)
6127   // then we calculate the cost of VF here.
6128   if (LoopCost == 0) {
6129     assert(expectedCost(VF).first.isValid() && "Expected a valid cost");
6130     LoopCost = *expectedCost(VF).first.getValue();
6131   }
6132 
6133   assert(LoopCost && "Non-zero loop cost expected");
6134 
6135   // Interleave if we vectorized this loop and there is a reduction that could
6136   // benefit from interleaving.
6137   if (VF.isVector() && HasReductions) {
6138     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6139     return IC;
6140   }
6141 
6142   // Note that if we've already vectorized the loop we will have done the
6143   // runtime check and so interleaving won't require further checks.
6144   bool InterleavingRequiresRuntimePointerCheck =
6145       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6146 
6147   // We want to interleave small loops in order to reduce the loop overhead and
6148   // potentially expose ILP opportunities.
6149   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6150                     << "LV: IC is " << IC << '\n'
6151                     << "LV: VF is " << VF << '\n');
6152   const bool AggressivelyInterleaveReductions =
6153       TTI.enableAggressiveInterleaving(HasReductions);
6154   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the loop overhead cost is 1. Use the cost model's
    // estimate of the loop cost and interleave until the loop overhead is
    // about 5% of the total cost of the loop.
6158     unsigned SmallIC =
6159         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
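    // As an illustrative sketch, assuming the small-loop threshold
    // (SmallLoopCost) is 20: a loop with an estimated cost of 6 yields
    // SmallIC = min(IC, PowerOf2Floor(20 / 6)) = min(IC, 2).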
6160 
6161     // Interleave until store/load ports (estimated by max interleave count) are
6162     // saturated.
6163     unsigned NumStores = Legal->getNumStores();
6164     unsigned NumLoads = Legal->getNumLoads();
6165     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6166     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
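    // For example (illustrative), with IC = 8, two stores and three loads,
    // StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 3 = 2 (integer division).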
6167 
    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), interleaving can increase the critical path length if
    // the loop we're interleaving is inside another loop. Limit the count, by
    // default to 2, so the critical path only grows by one reduction
    // operation.
6172     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6173       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6174       SmallIC = std::min(SmallIC, F);
6175       StoresIC = std::min(StoresIC, F);
6176       LoadsIC = std::min(LoadsIC, F);
6177     }
6178 
6179     if (EnableLoadStoreRuntimeInterleave &&
6180         std::max(StoresIC, LoadsIC) > SmallIC) {
6181       LLVM_DEBUG(
6182           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6183       return std::max(StoresIC, LoadsIC);
6184     }
6185 
6186     // If there are scalar reductions and TTI has enabled aggressive
6187     // interleaving for reductions, we will interleave to expose ILP.
6188     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6189         AggressivelyInterleaveReductions) {
6190       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressively as the normal
      // IC, to handle the rare case where resources are too limited.
6193       return std::max(IC / 2, SmallIC);
6194     } else {
6195       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6196       return SmallIC;
6197     }
6198   }
6199 
6200   // Interleave if this is a large loop (small loops are already dealt with by
6201   // this point) that could benefit from interleaving.
6202   if (AggressivelyInterleaveReductions) {
6203     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6204     return IC;
6205   }
6206 
6207   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6208   return 1;
6209 }
6210 
6211 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6212 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in topological order and assign
  // a number to each instruction. We use RPO to ensure that defs are met
  // before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear scan. We walk the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but
  // are used inside the loop. We need this number separately from the
  // max-interval usage number because, when we unroll, loop-invariant values
  // do not take more registers.
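  //
  // As an illustrative sketch, for a body like
  //   %a = load ...
  //   %b = add %a, 1
  //   store %b, ...
  // EndPoint records the index of the last in-loop use of %a and of %b,
  // TransposeEnds lists which intervals close at each index, and a single
  // linear walk then maintains the set of currently open intervals.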
6230   LoopBlocksDFS DFS(TheLoop);
6231   DFS.perform(LI);
6232 
6233   RegisterUsage RU;
6234 
6235   // Each 'key' in the map opens a new interval. The values
6236   // of the map are the index of the 'last seen' usage of the
6237   // instruction that is the key.
6238   using IntervalMap = DenseMap<Instruction *, unsigned>;
6239 
6240   // Maps instruction to its index.
6241   SmallVector<Instruction *, 64> IdxToInstr;
6242   // Marks the end of each interval.
6243   IntervalMap EndPoint;
  // Saves the set of instructions that have uses inside the loop.
6245   SmallPtrSet<Instruction *, 8> Ends;
6246   // Saves the list of values that are used in the loop but are
6247   // defined outside the loop, such as arguments and constants.
6248   SmallPtrSet<Value *, 8> LoopInvariants;
6249 
6250   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6251     for (Instruction &I : BB->instructionsWithoutDebug()) {
6252       IdxToInstr.push_back(&I);
6253 
6254       // Save the end location of each USE.
6255       for (Value *U : I.operands()) {
6256         auto *Instr = dyn_cast<Instruction>(U);
6257 
6258         // Ignore non-instruction values such as arguments, constants, etc.
6259         if (!Instr)
6260           continue;
6261 
6262         // If this instruction is outside the loop then record it and continue.
6263         if (!TheLoop->contains(Instr)) {
6264           LoopInvariants.insert(Instr);
6265           continue;
6266         }
6267 
6268         // Overwrite previous end points.
6269         EndPoint[Instr] = IdxToInstr.size();
6270         Ends.insert(Instr);
6271       }
6272     }
6273   }
6274 
6275   // Saves the list of intervals that end with the index in 'key'.
6276   using InstrList = SmallVector<Instruction *, 2>;
6277   DenseMap<unsigned, InstrList> TransposeEnds;
6278 
6279   // Transpose the EndPoints to a list of values that end at each index.
6280   for (auto &Interval : EndPoint)
6281     TransposeEnds[Interval.second].push_back(Interval.first);
6282 
6283   SmallPtrSet<Instruction *, 8> OpenIntervals;
6284   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6285   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6286 
6287   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6288 
6289   // A lambda that gets the register usage for the given type and VF.
6290   const auto &TTICapture = TTI;
6291   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
6292     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6293       return 0U;
6294     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6295   };
6296 
6297   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6298     Instruction *I = IdxToInstr[i];
6299 
6300     // Remove all of the instructions that end at this location.
6301     InstrList &List = TransposeEnds[i];
6302     for (Instruction *ToRemove : List)
6303       OpenIntervals.erase(ToRemove);
6304 
6305     // Ignore instructions that are never used within the loop.
6306     if (!Ends.count(I))
6307       continue;
6308 
6309     // Skip ignored values.
6310     if (ValuesToIgnore.count(I))
6311       continue;
6312 
6313     // For each VF find the maximum usage of registers.
6314     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6315       // Count the number of live intervals.
6316       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6317 
6318       if (VFs[j].isScalar()) {
        for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
          RegUsage[ClassID] += 1;
        }
6326       } else {
6327         collectUniformsAndScalars(VFs[j]);
6328         for (auto Inst : OpenIntervals) {
6329           // Skip ignored values for VF > 1.
6330           if (VecValuesToIgnore.count(Inst))
6331             continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
6345         }
6346       }
6347 
      for (auto &pair : RegUsage) {
        unsigned &Entry = MaxUsages[j][pair.first];
        Entry = std::max(Entry, pair.second);
      }
6354     }
6355 
6356     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6357                       << OpenIntervals.size() << '\n');
6358 
6359     // Add the current instruction to the list of open intervals.
6360     OpenIntervals.insert(I);
6361   }
6362 
6363   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6364     SmallMapVector<unsigned, unsigned, 4> Invariant;
6365 
6366     for (auto Inst : LoopInvariants) {
6367       unsigned Usage =
6368           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6369       unsigned ClassID =
6370           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      Invariant[ClassID] += Usage;
6375     }
6376 
6377     LLVM_DEBUG({
6378       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6379       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6380              << " item\n";
6381       for (const auto &pair : MaxUsages[i]) {
6382         dbgs() << "LV(REG): RegisterClass: "
6383                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6384                << " registers\n";
6385       }
6386       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6387              << " item\n";
6388       for (const auto &pair : Invariant) {
6389         dbgs() << "LV(REG): RegisterClass: "
6390                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6391                << " registers\n";
6392       }
6393     });
6394 
6395     RU.LoopInvariantRegs = Invariant;
6396     RU.MaxLocalUsers = MaxUsages[i];
6397     RUs[i] = RU;
6398   }
6399 
6400   return RUs;
6401 }
6402 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where a previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost
  // model. Emulation of masked loads/gathers was previously never allowed,
  // while emulation of a limited number of masked stores/scatters was.
6412   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
6413   return isa<LoadInst>(I) ||
6414          (isa<StoreInst>(I) &&
6415           NumPredStores > NumberOfStoresToPredicate);
6416 }
6417 
6418 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6419   // If we aren't vectorizing the loop, or if we've already collected the
6420   // instructions to scalarize, there's nothing to do. Collection may already
6421   // have occurred if we have a user-selected VF and are now computing the
6422   // expected cost for interleaving.
6423   if (VF.isScalar() || VF.isZero() ||
6424       InstsToScalarize.find(VF) != InstsToScalarize.end())
6425     return;
6426 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6428   // not profitable to scalarize any instructions, the presence of VF in the
6429   // map will indicate that we've analyzed it already.
6430   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6431 
  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better not to if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
6435   for (BasicBlock *BB : TheLoop->blocks()) {
6436     if (!blockNeedsPredication(BB))
6437       continue;
6438     for (Instruction &I : *BB)
6439       if (isScalarWithPredication(&I)) {
6440         ScalarCostsTy ScalarCosts;
6441         // Do not apply discount logic if hacked cost is needed
6442         // for emulated masked memrefs.
6443         if (!useEmulatedMaskMemRefHack(&I) &&
6444             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6445           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6446         // Remember that BB will remain after vectorization.
6447         PredicatedBBsAfterVectorization.insert(BB);
6448       }
6449   }
6450 }
6451 
6452 int LoopVectorizationCostModel::computePredInstDiscount(
6453     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6454   assert(!isUniformAfterVectorization(PredInst, VF) &&
6455          "Instruction marked uniform-after-vectorization will be predicated");
6456 
6457   // Initialize the discount to zero, meaning that the scalar version and the
6458   // vector version cost the same.
6459   InstructionCost Discount = 0;
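  // A positive final discount means the scalarized form of the whole chain is
  // expected to be cheaper than its vectorized form; the caller treats any
  // discount >= 0 as a reason to keep the chain scalar.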
6460 
6461   // Holds instructions to analyze. The instructions we visit are mapped in
6462   // ScalarCosts. Those instructions are the ones that would be scalarized if
6463   // we find that the scalar version costs less.
6464   SmallVector<Instruction *, 8> Worklist;
6465 
6466   // Returns true if the given instruction can be scalarized.
6467   auto canBeScalarized = [&](Instruction *I) -> bool {
6468     // We only attempt to scalarize instructions forming a single-use chain
6469     // from the original predicated block that would otherwise be vectorized.
6470     // Although not strictly necessary, we give up on instructions we know will
6471     // already be scalar to avoid traversing chains that are unlikely to be
6472     // beneficial.
6473     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6474         isScalarAfterVectorization(I, VF))
6475       return false;
6476 
6477     // If the instruction is scalar with predication, it will be analyzed
6478     // separately. We ignore it within the context of PredInst.
6479     if (isScalarWithPredication(I))
6480       return false;
6481 
6482     // If any of the instruction's operands are uniform after vectorization,
6483     // the instruction cannot be scalarized. This prevents, for example, a
6484     // masked load from being scalarized.
6485     //
6486     // We assume we will only emit a value for lane zero of an instruction
6487     // marked uniform after vectorization, rather than VF identical values.
6488     // Thus, if we scalarize an instruction that uses a uniform, we would
6489     // create uses of values corresponding to the lanes we aren't emitting code
6490     // for. This behavior can be changed by allowing getScalarValue to clone
6491     // the lane zero values for uniforms rather than asserting.
6492     for (Use &U : I->operands())
6493       if (auto *J = dyn_cast<Instruction>(U.get()))
6494         if (isUniformAfterVectorization(J, VF))
6495           return false;
6496 
6497     // Otherwise, we can scalarize the instruction.
6498     return true;
6499   };
6500 
6501   // Compute the expected cost discount from scalarizing the entire expression
6502   // feeding the predicated instruction. We currently only consider expressions
6503   // that are single-use instruction chains.
6504   Worklist.push_back(PredInst);
6505   while (!Worklist.empty()) {
6506     Instruction *I = Worklist.pop_back_val();
6507 
6508     // If we've already analyzed the instruction, there's nothing to do.
6509     if (ScalarCosts.find(I) != ScalarCosts.end())
6510       continue;
6511 
6512     // Compute the cost of the vector instruction. Note that this cost already
6513     // includes the scalarization overhead of the predicated instruction.
6514     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6515 
6516     // Compute the cost of the scalarized instruction. This cost is the cost of
6517     // the instruction as if it wasn't if-converted and instead remained in the
6518     // predicated block. We will scale this cost by block probability after
6519     // computing the scalarization overhead.
6520     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6521     InstructionCost ScalarCost =
6522         VF.getKnownMinValue() *
6523         getInstructionCost(I, ElementCount::getFixed(1)).first;
6524 
6525     // Compute the scalarization overhead of needed insertelement instructions
6526     // and phi nodes.
6527     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6528       ScalarCost += TTI.getScalarizationOverhead(
6529           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6530           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6531       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6532       ScalarCost +=
6533           VF.getKnownMinValue() *
6534           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6535     }
6536 
6537     // Compute the scalarization overhead of needed extractelement
6538     // instructions. For each of the instruction's operands, if the operand can
6539     // be scalarized, add it to the worklist; otherwise, account for the
6540     // overhead.
6541     for (Use &U : I->operands())
6542       if (auto *J = dyn_cast<Instruction>(U.get())) {
6543         assert(VectorType::isValidElementType(J->getType()) &&
6544                "Instruction has non-scalar type");
6545         if (canBeScalarized(J))
6546           Worklist.push_back(J);
6547         else if (needsExtract(J, VF)) {
6548           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6549           ScalarCost += TTI.getScalarizationOverhead(
6550               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6551               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6552         }
6553       }
6554 
6555     // Scale the total scalar cost by block probability.
6556     ScalarCost /= getReciprocalPredBlockProb();
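    // As an illustrative sketch, assuming the usual reciprocal block
    // probability of 2 (i.e. the predicated block executes on roughly half of
    // the iterations), a raw scalar cost of 8 is scaled down to 4 here.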
6557 
6558     // Compute the discount. A non-negative discount means the vector version
6559     // of the instruction costs more, and scalarizing would be beneficial.
6560     Discount += VectorCost - ScalarCost;
6561     ScalarCosts[I] = ScalarCost;
6562   }
6563 
6564   return *Discount.getValue();
6565 }
6566 
6567 LoopVectorizationCostModel::VectorizationCostTy
6568 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6569   VectorizationCostTy Cost;
6570 
6571   // For each block.
6572   for (BasicBlock *BB : TheLoop->blocks()) {
6573     VectorizationCostTy BlockCost;
6574 
6575     // For each instruction in the old loop.
6576     for (Instruction &I : BB->instructionsWithoutDebug()) {
6577       // Skip ignored values.
6578       if (ValuesToIgnore.count(&I) ||
6579           (VF.isVector() && VecValuesToIgnore.count(&I)))
6580         continue;
6581 
6582       VectorizationCostTy C = getInstructionCost(&I, VF);
6583 
6584       // Check if we should override the cost.
6585       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6586         C.first = InstructionCost(ForceTargetInstructionCost);
6587 
6588       BlockCost.first += C.first;
6589       BlockCost.second |= C.second;
6590       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6591                         << " for VF " << VF << " For instruction: " << I
6592                         << '\n');
6593     }
6594 
6595     // If we are vectorizing a predicated block, it will have been
6596     // if-converted. This means that the block's instructions (aside from
6597     // stores and instructions that may divide by zero) will now be
6598     // unconditionally executed. For the scalar case, we may not always execute
6599     // the predicated block, if it is an if-else block. Thus, scale the block's
6600     // cost by the probability of executing it. blockNeedsPredication from
6601     // Legal is used so as to not include all blocks in tail folded loops.
6602     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6603       BlockCost.first /= getReciprocalPredBlockProb();
6604 
6605     Cost.first += BlockCost.first;
6606     Cost.second |= BlockCost.second;
6607   }
6608 
6609   return Cost;
6610 }
6611 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
/// calculation cost.
static const SCEV *getAddressAccessSCEV(Value *Ptr,
                                        LoopVectorizationLegality *Legal,
                                        PredicatedScalarEvolution &PSE,
                                        const Loop *TheLoop) {
6623   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6624   if (!Gep)
6625     return nullptr;
6626 
6627   // We are looking for a gep with all loop invariant indices except for one
6628   // which should be an induction variable.
6629   auto SE = PSE.getSE();
6630   unsigned NumOperands = Gep->getNumOperands();
6631   for (unsigned i = 1; i < NumOperands; ++i) {
6632     Value *Opd = Gep->getOperand(i);
6633     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6634         !Legal->isInductionVariable(Opd))
6635       return nullptr;
6636   }
6637 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6639   return PSE.getSCEV(Ptr);
6640 }
6641 
6642 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6643   return Legal->hasStride(I->getOperand(0)) ||
6644          Legal->hasStride(I->getOperand(1));
6645 }
6646 
6647 unsigned
6648 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6649                                                         ElementCount VF) {
6650   assert(VF.isVector() &&
6651          "Scalarization cost of instruction implies vectorization.");
6652   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6653   Type *ValTy = getMemInstValueType(I);
6654   auto SE = PSE.getSE();
6655 
6656   unsigned AS = getLoadStoreAddressSpace(I);
6657   Value *Ptr = getLoadStorePointerOperand(I);
6658   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6659 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6662   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6663 
6664   // Get the cost of the scalar memory instruction and address computation.
6665   unsigned Cost =
6666       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6667 
6668   // Don't pass *I here, since it is scalar but will actually be part of a
6669   // vectorized loop where the user of it is a vectorized instruction.
6670   const Align Alignment = getLoadStoreAlignment(I);
6671   Cost += VF.getKnownMinValue() *
6672           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6673                               AS, TTI::TCK_RecipThroughput);
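  // As an illustrative sketch, at VF = 4 a scalar memory operation costing 1
  // with an address computation cost of 1 accumulates 4 * 1 + 4 * 1 = 8 here,
  // before the insert/extract overhead below is added.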
6674 
6675   // Get the overhead of the extractelement and insertelement instructions
6676   // we might create due to scalarization.
6677   Cost += getScalarizationOverhead(I, VF);
6678 
6679   // If we have a predicated store, it may not be executed for each vector
6680   // lane. Scale the cost by the probability of executing the predicated
6681   // block.
6682   if (isPredicatedInst(I)) {
6683     Cost /= getReciprocalPredBlockProb();
6684 
6685     if (useEmulatedMaskMemRefHack(I))
6686       // Artificially setting to a high enough value to practically disable
6687       // vectorization with such operations.
6688       Cost = 3000000;
6689   }
6690 
6691   return Cost;
6692 }
6693 
6694 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6695                                                              ElementCount VF) {
6696   Type *ValTy = getMemInstValueType(I);
6697   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6698   Value *Ptr = getLoadStorePointerOperand(I);
6699   unsigned AS = getLoadStoreAddressSpace(I);
6700   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6701   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6702 
6703   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6704          "Stride should be 1 or -1 for consecutive memory access");
6705   const Align Alignment = getLoadStoreAlignment(I);
6706   unsigned Cost = 0;
6707   if (Legal->isMaskRequired(I))
6708     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6709                                       CostKind);
6710   else
6711     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6712                                 CostKind, I);
6713 
6714   bool Reverse = ConsecutiveStride < 0;
6715   if (Reverse)
6716     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6717   return Cost;
6718 }
6719 
6720 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6721                                                          ElementCount VF) {
6722   assert(Legal->isUniformMemOp(*I));
6723 
6724   Type *ValTy = getMemInstValueType(I);
6725   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6726   const Align Alignment = getLoadStoreAlignment(I);
6727   unsigned AS = getLoadStoreAddressSpace(I);
6728   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
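  // A uniform load is modeled as one scalar load plus a broadcast; a uniform
  // store is a scalar store plus, when the stored value is not loop invariant,
  // an extract of the last vector lane (index VF - 1).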
6729   if (isa<LoadInst>(I)) {
6730     return TTI.getAddressComputationCost(ValTy) +
6731            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6732                                CostKind) +
6733            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6734   }
6735   StoreInst *SI = cast<StoreInst>(I);
6736 
6737   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6738   return TTI.getAddressComputationCost(ValTy) +
6739          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6740                              CostKind) +
6741          (isLoopInvariantStoreValue
6742               ? 0
6743               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6744                                        VF.getKnownMinValue() - 1));
6745 }
6746 
6747 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6748                                                           ElementCount VF) {
6749   Type *ValTy = getMemInstValueType(I);
6750   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6751   const Align Alignment = getLoadStoreAlignment(I);
6752   const Value *Ptr = getLoadStorePointerOperand(I);
6753 
6754   return TTI.getAddressComputationCost(VectorTy) +
6755          TTI.getGatherScatterOpCost(
6756              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6757              TargetTransformInfo::TCK_RecipThroughput, I);
6758 }
6759 
6760 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6761                                                             ElementCount VF) {
6762   Type *ValTy = getMemInstValueType(I);
6763   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6764   unsigned AS = getLoadStoreAddressSpace(I);
6765 
6766   auto Group = getInterleavedAccessGroup(I);
6767   assert(Group && "Fail to get an interleaved access group.");
6768 
6769   unsigned InterleaveFactor = Group->getFactor();
6770   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6771   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6772 
6773   // Holds the indices of existing members in an interleaved load group.
6774   // An interleaved store group doesn't need this as it doesn't allow gaps.
6775   SmallVector<unsigned, 4> Indices;
6776   if (isa<LoadInst>(I)) {
6777     for (unsigned i = 0; i < InterleaveFactor; i++)
6778       if (Group->getMember(i))
6779         Indices.push_back(i);
6780   }
6781 
6782   // Calculate the cost of the whole interleaved group.
6783   bool UseMaskForGaps =
6784       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6785   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6786       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6787       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6788 
6789   if (Group->isReverse()) {
6790     // TODO: Add support for reversed masked interleaved access.
6791     assert(!Legal->isMaskRequired(I) &&
6792            "Reverse masked interleaved access not supported.");
6793     Cost += Group->getNumMembers() *
6794             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6795   }
6796   return Cost;
6797 }
6798 
6799 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6800                                                               ElementCount VF) {
6801   // Calculate scalar cost only. Vectorization cost should be ready at this
6802   // moment.
6803   if (VF.isScalar()) {
6804     Type *ValTy = getMemInstValueType(I);
6805     const Align Alignment = getLoadStoreAlignment(I);
6806     unsigned AS = getLoadStoreAddressSpace(I);
6807 
6808     return TTI.getAddressComputationCost(ValTy) +
6809            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6810                                TTI::TCK_RecipThroughput, I);
6811   }
6812   return getWideningCost(I, VF);
6813 }
6814 
6815 LoopVectorizationCostModel::VectorizationCostTy
6816 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6817                                                ElementCount VF) {
6818   // If we know that this instruction will remain uniform, check the cost of
6819   // the scalar version.
6820   if (isUniformAfterVectorization(I, VF))
6821     VF = ElementCount::getFixed(1);
6822 
6823   if (VF.isVector() && isProfitableToScalarize(I, VF))
6824     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6825 
6826   // Forced scalars do not have any scalarization overhead.
6827   auto ForcedScalar = ForcedScalars.find(VF);
6828   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6829     auto InstSet = ForcedScalar->second;
6830     if (InstSet.count(I))
6831       return VectorizationCostTy(
6832           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6833            VF.getKnownMinValue()),
6834           false);
6835   }
6836 
6837   Type *VectorTy;
6838   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6839 
6840   bool TypeNotScalarized =
6841       VF.isVector() && VectorTy->isVectorTy() &&
6842       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6843   return VectorizationCostTy(C, TypeNotScalarized);
6844 }
6845 
6846 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6847                                                               ElementCount VF) {
6848 
6849   assert(!VF.isScalable() &&
6850          "cannot compute scalarization overhead for scalable vectorization");
6851   if (VF.isScalar())
6852     return 0;
6853 
6854   unsigned Cost = 0;
6855   Type *RetTy = ToVectorTy(I->getType(), VF);
6856   if (!RetTy->isVoidTy() &&
6857       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6858     Cost += TTI.getScalarizationOverhead(
6859         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6860         true, false);
6861 
6862   // Some targets keep addresses scalar.
6863   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6864     return Cost;
6865 
6866   // Some targets support efficient element stores.
6867   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6868     return Cost;
6869 
6870   // Collect operands to consider.
6871   CallInst *CI = dyn_cast<CallInst>(I);
6872   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6873 
6874   // Skip operands that do not require extraction/scalarization and do not incur
6875   // any overhead.
6876   return Cost + TTI.getOperandsScalarizationOverhead(
6877                     filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6878 }
6879 
6880 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6881   if (VF.isScalar())
6882     return;
6883   NumPredStores = 0;
6884   for (BasicBlock *BB : TheLoop->blocks()) {
6885     // For each instruction in the old loop.
6886     for (Instruction &I : *BB) {
6887       Value *Ptr =  getLoadStorePointerOperand(&I);
6888       if (!Ptr)
6889         continue;
6890 
6891       // TODO: We should generate better code and update the cost model for
6892       // predicated uniform stores. Today they are treated as any other
6893       // predicated store (see added test cases in
6894       // invariant-store-vectorization.ll).
6895       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6896         NumPredStores++;
6897 
6898       if (Legal->isUniformMemOp(I)) {
6899         // TODO: Avoid replicating loads and stores instead of
6900         // relying on instcombine to remove them.
6901         // Load: Scalar load + broadcast
6902         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6903         unsigned Cost = getUniformMemOpCost(&I, VF);
6904         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6905         continue;
6906       }
6907 
6908       // We assume that widening is the best solution when possible.
6909       if (memoryInstructionCanBeWidened(&I, VF)) {
6910         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6911         int ConsecutiveStride =
6912                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6913         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6914                "Expected consecutive stride.");
6915         InstWidening Decision =
6916             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6917         setWideningDecision(&I, VF, Decision, Cost);
6918         continue;
6919       }
6920 
6921       // Choose between Interleaving, Gather/Scatter or Scalarization.
6922       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6923       unsigned NumAccesses = 1;
6924       if (isAccessInterleaved(&I)) {
6925         auto Group = getInterleavedAccessGroup(&I);
6926         assert(Group && "Fail to get an interleaved access group.");
6927 
6928         // Make one decision for the whole group.
6929         if (getWideningDecision(&I, VF) != CM_Unknown)
6930           continue;
6931 
6932         NumAccesses = Group->getNumMembers();
6933         if (interleavedAccessCanBeWidened(&I, VF))
6934           InterleaveCost = getInterleaveGroupCost(&I, VF);
6935       }
6936 
6937       unsigned GatherScatterCost =
6938           isLegalGatherOrScatter(&I)
6939               ? getGatherScatterCost(&I, VF) * NumAccesses
6940               : std::numeric_limits<unsigned>::max();
6941 
6942       unsigned ScalarizationCost =
6943           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6944 
6945       // Choose better solution for the current VF,
6946       // write down this decision and use it during vectorization.
6947       unsigned Cost;
6948       InstWidening Decision;
6949       if (InterleaveCost <= GatherScatterCost &&
6950           InterleaveCost < ScalarizationCost) {
6951         Decision = CM_Interleave;
6952         Cost = InterleaveCost;
6953       } else if (GatherScatterCost < ScalarizationCost) {
6954         Decision = CM_GatherScatter;
6955         Cost = GatherScatterCost;
6956       } else {
6957         Decision = CM_Scalarize;
6958         Cost = ScalarizationCost;
6959       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
6963       if (auto Group = getInterleavedAccessGroup(&I))
6964         setWideningDecision(Group, VF, Decision, Cost);
6965       else
6966         setWideningDecision(&I, VF, Decision, Cost);
6967     }
6968   }
6969 
6970   // Make sure that any load of address and any other address computation
6971   // remains scalar unless there is gather/scatter support. This avoids
6972   // inevitable extracts into address registers, and also has the benefit of
6973   // activating LSR more, since that pass can't optimize vectorized
6974   // addresses.
6975   if (TTI.prefersVectorizedAddressing())
6976     return;
6977 
6978   // Start with all scalar pointer uses.
6979   SmallPtrSet<Instruction *, 8> AddrDefs;
6980   for (BasicBlock *BB : TheLoop->blocks())
6981     for (Instruction &I : *BB) {
6982       Instruction *PtrDef =
6983         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6984       if (PtrDef && TheLoop->contains(PtrDef) &&
6985           getWideningDecision(&I, VF) != CM_GatherScatter)
6986         AddrDefs.insert(PtrDef);
6987     }
6988 
6989   // Add all instructions used to generate the addresses.
6990   SmallVector<Instruction *, 4> Worklist;
6991   for (auto *I : AddrDefs)
6992     Worklist.push_back(I);
6993   while (!Worklist.empty()) {
6994     Instruction *I = Worklist.pop_back_val();
6995     for (auto &Op : I->operands())
6996       if (auto *InstOp = dyn_cast<Instruction>(Op))
6997         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6998             AddrDefs.insert(InstOp).second)
6999           Worklist.push_back(InstOp);
7000   }
7001 
7002   for (auto *I : AddrDefs) {
7003     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves finding out whether the
      // loaded register is involved in an address computation, it is instead
      // changed here when we know this is the case.
7008       InstWidening Decision = getWideningDecision(I, VF);
7009       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7010         // Scalarize a widened load of address.
7011         setWideningDecision(
7012             I, VF, CM_Scalarize,
7013             (VF.getKnownMinValue() *
7014              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7015       else if (auto Group = getInterleavedAccessGroup(I)) {
7016         // Scalarize an interleave group of address loads.
7017         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7018           if (Instruction *Member = Group->getMember(I))
7019             setWideningDecision(
7020                 Member, VF, CM_Scalarize,
7021                 (VF.getKnownMinValue() *
7022                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7023         }
7024       }
7025     } else
7026       // Make sure I gets scalarized and a cost estimate without
7027       // scalarization overhead.
7028       ForcedScalars[VF].insert(I);
7029   }
7030 }
7031 
7032 InstructionCost
7033 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7034                                                Type *&VectorTy) {
7035   Type *RetTy = I->getType();
7036   if (canTruncateToMinimalBitwidth(I, VF))
7037     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7038   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
7039   auto SE = PSE.getSE();
7040   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7041 
7042   // TODO: We need to estimate the cost of intrinsic calls.
7043   switch (I->getOpcode()) {
7044   case Instruction::GetElementPtr:
7045     // We mark this instruction as zero-cost because the cost of GEPs in
7046     // vectorized code depends on whether the corresponding memory instruction
7047     // is scalarized or not. Therefore, we handle GEPs with the memory
7048     // instruction cost.
7049     return 0;
7050   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
7054     bool ScalarPredicatedBB = false;
7055     BranchInst *BI = cast<BranchInst>(I);
7056     if (VF.isVector() && BI->isConditional() &&
7057         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7058          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7059       ScalarPredicatedBB = true;
7060 
7061     if (ScalarPredicatedBB) {
7062       // Return cost for branches around scalarized and predicated blocks.
7063       assert(!VF.isScalable() && "scalable vectors not yet supported.");
7064       auto *Vec_i1Ty =
7065           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7066       return (TTI.getScalarizationOverhead(
7067                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
7068                   false, true) +
7069               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
7070                VF.getKnownMinValue()));
7071     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7072       // The back-edge branch will remain, as will all scalar branches.
7073       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7074     else
7075       // This branch will be eliminated by if-conversion.
7076       return 0;
7077     // Note: We currently assume zero cost for an unconditional branch inside
7078     // a predicated block since it will become a fall-through, although we
7079     // may decide in the future to call TTI for all branches.
7080   }
7081   case Instruction::PHI: {
7082     auto *Phi = cast<PHINode>(I);
7083 
7084     // First-order recurrences are replaced by vector shuffles inside the loop.
7085     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7086     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7087       return TTI.getShuffleCost(
7088           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7089           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7090 
7091     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7092     // converted into select instructions. We require N - 1 selects per phi
7093     // node, where N is the number of incoming values.
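    // For example, a phi in a non-header block that merges three incoming
    // values is costed as two vector selects.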
7094     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7095       return (Phi->getNumIncomingValues() - 1) *
7096              TTI.getCmpSelInstrCost(
7097                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7098                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7099                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7100 
7101     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7102   }
7103   case Instruction::UDiv:
7104   case Instruction::SDiv:
7105   case Instruction::URem:
7106   case Instruction::SRem:
7107     // If we have a predicated instruction, it may not be executed for each
7108     // vector lane. Get the scalarization cost and scale this amount by the
7109     // probability of executing the predicated block. If the instruction is not
7110     // predicated, we fall through to the next case.
7111     if (VF.isVector() && isScalarWithPredication(I)) {
7112       unsigned Cost = 0;
7113 
7114       // These instructions have a non-void type, so account for the phi nodes
7115       // that we will create. This cost is likely to be zero. The phi node
7116       // cost, if any, should be scaled by the block probability because it
7117       // models a copy at the end of each predicated block.
7118       Cost += VF.getKnownMinValue() *
7119               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7120 
7121       // The cost of the non-predicated instruction.
7122       Cost += VF.getKnownMinValue() *
7123               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7124 
7125       // The cost of insertelement and extractelement instructions needed for
7126       // scalarization.
7127       Cost += getScalarizationOverhead(I, VF);
7128 
7129       // Scale the cost by the probability of executing the predicated blocks.
7130       // This assumes the predicated block for each vector lane is equally
7131       // likely.
7132       return Cost / getReciprocalPredBlockProb();
7133     }
7134     LLVM_FALLTHROUGH;
7135   case Instruction::Add:
7136   case Instruction::FAdd:
7137   case Instruction::Sub:
7138   case Instruction::FSub:
7139   case Instruction::Mul:
7140   case Instruction::FMul:
7141   case Instruction::FDiv:
7142   case Instruction::FRem:
7143   case Instruction::Shl:
7144   case Instruction::LShr:
7145   case Instruction::AShr:
7146   case Instruction::And:
7147   case Instruction::Or:
7148   case Instruction::Xor: {
7149     // Since we will replace the stride by 1 the multiplication should go away.
7150     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7151       return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
7154     Value *Op2 = I->getOperand(1);
7155     TargetTransformInfo::OperandValueProperties Op2VP;
7156     TargetTransformInfo::OperandValueKind Op2VK =
7157         TTI.getOperandInfo(Op2, Op2VP);
7158     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7159       Op2VK = TargetTransformInfo::OK_UniformValue;
7160 
7161     SmallVector<const Value *, 4> Operands(I->operand_values());
7162     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7163     return N * TTI.getArithmeticInstrCost(
7164                    I->getOpcode(), VectorTy, CostKind,
7165                    TargetTransformInfo::OK_AnyValue,
7166                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7167   }
7168   case Instruction::FNeg: {
7169     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7170     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7171     return N * TTI.getArithmeticInstrCost(
7172                    I->getOpcode(), VectorTy, CostKind,
7173                    TargetTransformInfo::OK_AnyValue,
7174                    TargetTransformInfo::OK_AnyValue,
7175                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
7176                    I->getOperand(0), I);
7177   }
7178   case Instruction::Select: {
7179     SelectInst *SI = cast<SelectInst>(I);
7180     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7181     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7182     Type *CondTy = SI->getCondition()->getType();
7183     if (!ScalarCond) {
7184       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7185       CondTy = VectorType::get(CondTy, VF);
7186     }
7187     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7188                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7189   }
7190   case Instruction::ICmp:
7191   case Instruction::FCmp: {
7192     Type *ValTy = I->getOperand(0)->getType();
7193     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7194     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7195       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7196     VectorTy = ToVectorTy(ValTy, VF);
7197     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7198                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7199   }
7200   case Instruction::Store:
7201   case Instruction::Load: {
7202     ElementCount Width = VF;
7203     if (Width.isVector()) {
7204       InstWidening Decision = getWideningDecision(I, Width);
7205       assert(Decision != CM_Unknown &&
7206              "CM decision should be taken at this point");
7207       if (Decision == CM_Scalarize)
7208         Width = ElementCount::getFixed(1);
7209     }
7210     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
7211     return getMemoryInstructionCost(I, VF);
7212   }
7213   case Instruction::ZExt:
7214   case Instruction::SExt:
7215   case Instruction::FPToUI:
7216   case Instruction::FPToSI:
7217   case Instruction::FPExt:
7218   case Instruction::PtrToInt:
7219   case Instruction::IntToPtr:
7220   case Instruction::SIToFP:
7221   case Instruction::UIToFP:
7222   case Instruction::Trunc:
7223   case Instruction::FPTrunc:
7224   case Instruction::BitCast: {
7225     // Computes the CastContextHint from a Load/Store instruction.
7226     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7227       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7228              "Expected a load or a store!");
7229 
7230       if (VF.isScalar() || !TheLoop->contains(I))
7231         return TTI::CastContextHint::Normal;
7232 
7233       switch (getWideningDecision(I, VF)) {
7234       case LoopVectorizationCostModel::CM_GatherScatter:
7235         return TTI::CastContextHint::GatherScatter;
7236       case LoopVectorizationCostModel::CM_Interleave:
7237         return TTI::CastContextHint::Interleave;
7238       case LoopVectorizationCostModel::CM_Scalarize:
7239       case LoopVectorizationCostModel::CM_Widen:
7240         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7241                                         : TTI::CastContextHint::Normal;
7242       case LoopVectorizationCostModel::CM_Widen_Reverse:
7243         return TTI::CastContextHint::Reversed;
7244       case LoopVectorizationCostModel::CM_Unknown:
7245         llvm_unreachable("Instr did not go through cost modelling?");
7246       }
7247 
7248       llvm_unreachable("Unhandled case!");
7249     };
7250 
7251     unsigned Opcode = I->getOpcode();
7252     TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc/FPTrunc, the context is the only user, which must be a
    // StoreInst.
7254     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7255       if (I->hasOneUse())
7256         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7257           CCH = ComputeCCH(Store);
7258     }
    // For ZExt/SExt/FPExt, the context is the operand, which must be a
    // LoadInst.
7260     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7261              Opcode == Instruction::FPExt) {
7262       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7263         CCH = ComputeCCH(Load);
7264     }
7265 
7266     // We optimize the truncation of induction variables having constant
7267     // integer steps. The cost of these truncations is the same as the scalar
7268     // operation.
7269     if (isOptimizableIVTruncate(I, VF)) {
7270       auto *Trunc = cast<TruncInst>(I);
7271       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7272                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7273     }
7274 
7275     Type *SrcScalarTy = I->getOperand(0)->getType();
7276     Type *SrcVecTy =
7277         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7278     if (canTruncateToMinimalBitwidth(I, VF)) {
7279       // This cast is going to be shrunk. This may remove the cast or it might
7280       // turn it into slightly different cast. For example, if MinBW == 16,
7281       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7282       //
7283       // Calculate the modified src and dest types.
7284       Type *MinVecTy = VectorTy;
7285       if (Opcode == Instruction::Trunc) {
7286         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7287         VectorTy =
7288             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7289       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7290         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7291         VectorTy =
7292             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7293       }
7294     }
7295 
7296     assert(!VF.isScalable() && "VF is assumed to be non scalable");
7297     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7298     return N *
7299            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7300   }
7301   case Instruction::Call: {
7302     bool NeedToScalarize;
7303     CallInst *CI = cast<CallInst>(I);
7304     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7305     if (getVectorIntrinsicIDForCall(CI, TLI))
7306       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
7307     return CallCost;
7308   }
7309   case Instruction::ExtractValue:
7310     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7311   default:
7312     // The cost of executing VF copies of the scalar instruction. This opcode
7313     // is unknown. Assume that it is the same as 'mul'.
7314     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
7315                                        Instruction::Mul, VectorTy, CostKind) +
7316            getScalarizationOverhead(I, VF);
7317   } // end of switch.
7318 }
7319 
7320 char LoopVectorize::ID = 0;
7321 
7322 static const char lv_name[] = "Loop Vectorization";
7323 
7324 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7325 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7326 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7327 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7328 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7329 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7330 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7331 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7332 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7333 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7334 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7335 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7336 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7337 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7338 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7339 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7340 
7341 namespace llvm {
7342 
7343 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7344 
7345 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7346                               bool VectorizeOnlyWhenForced) {
7347   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7348 }
7349 
7350 } // end namespace llvm
7351 
7352 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7353   // Check if the pointer operand of a load or store instruction is
7354   // consecutive.
7355   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7356     return Legal->isConsecutivePtr(Ptr);
7357   return false;
7358 }
7359 
7360 void LoopVectorizationCostModel::collectValuesToIgnore() {
7361   // Ignore ephemeral values.
7362   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7363 
7364   // Ignore type-promoting instructions we identified during reduction
7365   // detection.
7366   for (auto &Reduction : Legal->getReductionVars()) {
7367     RecurrenceDescriptor &RedDes = Reduction.second;
7368     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7369     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7370   }
7371   // Ignore type-casting instructions we identified during induction
7372   // detection.
7373   for (auto &Induction : Legal->getInductionVars()) {
7374     InductionDescriptor &IndDes = Induction.second;
7375     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7376     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7377   }
7378 }
7379 
7380 void LoopVectorizationCostModel::collectInLoopReductions() {
7381   for (auto &Reduction : Legal->getReductionVars()) {
7382     PHINode *Phi = Reduction.first;
7383     RecurrenceDescriptor &RdxDesc = Reduction.second;
7384 
7385     // We don't collect reductions that are type promoted (yet).
7386     if (RdxDesc.getRecurrenceType() != Phi->getType())
7387       continue;
7388 
7389     // If the target would prefer this reduction to happen "in-loop", then we
7390     // want to record it as such.
7391     unsigned Opcode = RdxDesc.getOpcode();
7392     if (!PreferInLoopReductions &&
7393         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7394                                    TargetTransformInfo::ReductionFlags()))
7395       continue;
7396 
7397     // Check that we can correctly put the reductions into the loop, by
7398     // finding the chain of operations that leads from the phi to the loop
7399     // exit value.
7400     SmallVector<Instruction *, 4> ReductionOperations =
7401         RdxDesc.getReductionOpChain(Phi, TheLoop);
7402     bool InLoop = !ReductionOperations.empty();
7403     if (InLoop)
7404       InLoopReductionChains[Phi] = ReductionOperations;
7405     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7406                       << " reduction for phi: " << *Phi << "\n");
7407   }
7408 }
7409 
7410 // TODO: we could return a pair of values that specify the max VF and
7411 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7413 // doesn't have a cost model that can choose which plan to execute if
7414 // more than one is generated.
7415 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7416                                  LoopVectorizationCostModel &CM) {
7417   unsigned WidestType;
7418   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
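  // E.g. (illustrative): 256-bit vector registers and a widest element type
  // of 32 bits yield a VF of 8.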
7419   return WidestVectorRegBits / WidestType;
7420 }
7421 
7422 VectorizationFactor
7423 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7424   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7425   ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before even evaluating whether vectorization is
  // profitable.
7428   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7429   // the vectorization pipeline.
7430   if (!OrigLoop->isInnermost()) {
7431     // If the user doesn't provide a vectorization factor, determine a
7432     // reasonable one.
7433     if (UserVF.isZero()) {
7434       VF = ElementCount::getFixed(
7435           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7436       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7437 
7438       // Make sure we have a VF > 1 for stress testing.
7439       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7440         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7441                           << "overriding computed VF.\n");
7442         VF = ElementCount::getFixed(4);
7443       }
7444     }
7445     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7446     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7447            "VF needs to be a power of two");
7448     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7449                       << "VF " << VF << " to build VPlans.\n");
7450     buildVPlans(VF, VF);
7451 
7452     // For VPlan build stress testing, we bail out after VPlan construction.
7453     if (VPlanBuildStressTest)
7454       return VectorizationFactor::Disabled();
7455 
7456     return {VF, 0 /*Cost*/};
7457   }
7458 
7459   LLVM_DEBUG(
7460       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7461                 "VPlan-native path.\n");
7462   return VectorizationFactor::Disabled();
7463 }
7464 
7465 Optional<VectorizationFactor>
7466 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7467   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7468   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
7470     return None;
7471 
7472   // Invalidate interleave groups if all blocks of loop will be predicated.
7473   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
7474       !useMaskedInterleavedAccesses(*TTI)) {
7475     LLVM_DEBUG(
7476         dbgs()
7477         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7478            "which requires masked-interleaved support.\n");
7479     if (CM.InterleaveInfo.invalidateGroups())
7480       // Invalidating interleave groups also requires invalidating all decisions
7481       // based on them, which includes widening decisions and uniform and scalar
7482       // values.
7483       CM.invalidateCostModelingDecisions();
7484   }
7485 
7486   ElementCount MaxVF = MaybeMaxVF.getValue();
7487   assert(MaxVF.isNonZero() && "MaxVF is zero.");
7488 
7489   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
7490   if (!UserVF.isZero() &&
7491       (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
    // FIXME: MaxVF is temporarily used in place of UserVF for illegal scalable
    // VFs here; this should be reverted to only use legal UserVFs once the
    // loop below supports scalable VFs.
7495     ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
7496     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
7497                       << " VF " << VF << ".\n");
7498     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7499            "VF needs to be a power of two");
7500     // Collect the instructions (and their associated costs) that will be more
7501     // profitable to scalarize.
7502     CM.selectUserVectorizationFactor(VF);
7503     CM.collectInLoopReductions();
7504     buildVPlansWithVPRecipes(VF, VF);
7505     LLVM_DEBUG(printPlans(dbgs()));
7506     return {{VF, 0}};
7507   }
7508 
7509   assert(!MaxVF.isScalable() &&
7510          "Scalable vectors not yet supported beyond this point");
7511 
7512   for (ElementCount VF = ElementCount::getFixed(1);
7513        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
7514     // Collect Uniform and Scalar instructions after vectorization with VF.
7515     CM.collectUniformsAndScalars(VF);
7516 
7517     // Collect the instructions (and their associated costs) that will be more
7518     // profitable to scalarize.
7519     if (VF.isVector())
7520       CM.collectInstsToScalarize(VF);
7521   }
7522 
7523   CM.collectInLoopReductions();
7524 
7525   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
7526   LLVM_DEBUG(printPlans(dbgs()));
7527   if (MaxVF.isScalar())
7528     return VectorizationFactor::Disabled();
7529 
7530   // Select the optimal vectorization factor.
7531   return CM.selectVectorizationFactor(MaxVF);
7532 }
7533 
7534 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7535   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7536                     << '\n');
7537   BestVF = VF;
7538   BestUF = UF;
7539 
7540   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7541     return !Plan->hasVF(VF);
7542   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7544 }
7545 
7546 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7547                                            DominatorTree *DT) {
7548   // Perform the actual loop transformation.
7549 
7550   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7551   VPCallbackILV CallbackILV(ILV);
7552 
7553   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7554 
7555   VPTransformState State{*BestVF, BestUF,      LI,
7556                          DT,      ILV.Builder, ILV.VectorLoopValueMap,
7557                          &ILV,    CallbackILV};
7558   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7559   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7560   State.CanonicalIV = ILV.Induction;
7561 
7562   ILV.printDebugTracesAtStart();
7563 
7564   //===------------------------------------------------===//
7565   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost model.
7569   //
7570   //===------------------------------------------------===//
7571 
7572   // 2. Copy and widen instructions from the old loop into the new loop.
7573   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7574   VPlans.front()->execute(&State);
7575 
7576   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7577   //    predication, updating analyses.
7578   ILV.fixVectorizedLoop();
7579 
7580   ILV.printDebugTracesAtEnd();
7581 }
7582 
7583 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7584     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7585 
  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if they are only used by
  // their block's terminator.
7589   SmallVector<BasicBlock*> ExitingBlocks;
7590   OrigLoop->getExitingBlocks(ExitingBlocks);
7591   for (auto *BB : ExitingBlocks) {
7592     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7593     if (!Cmp || !Cmp->hasOneUse())
7594       continue;
7595 
7596     // TODO: we should introduce a getUniqueExitingBlocks on Loop
7597     if (!DeadInstructions.insert(Cmp).second)
7598       continue;
7599 
    // An operand of the icmp is often a dead trunc, used by IndUpdate.
    // TODO: we could recurse through operands in general.
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
7605     }
7606   }
7607 
7608   // We create new "steps" for induction variable updates to which the original
7609   // induction variables map. An original update instruction will be dead if
7610   // all its users except the induction variable are dead.
7611   auto *Latch = OrigLoop->getLoopLatch();
7612   for (auto &Induction : Legal->getInductionVars()) {
7613     PHINode *Ind = Induction.first;
7614     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7615 
    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
7618     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7619       continue;
7620 
7621     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7622           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7623         }))
7624       DeadInstructions.insert(IndUpdate);
7625 
    // We also record as "Dead" the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
7628     // vectorized loop because we have proven that, under a proper runtime
7629     // test guarding the vectorized loop, the value of the phi, and the casted
7630     // value of the phi, are the same. The last instruction in this casting chain
7631     // will get its scalar/vector/widened def from the scalar/vector/widened def
7632     // of the respective phi node. Any other casts in the induction def-use chain
7633     // have no other uses outside the phi update chain, and will be ignored.
7634     InductionDescriptor &IndDes = Induction.second;
7635     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7636     DeadInstructions.insert(Casts.begin(), Casts.end());
7637   }
7638 }
7639 
7640 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7641 
7642 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7643 
7644 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7645                                         Instruction::BinaryOps BinOp) {
7646   // When unrolling and the VF is 1, we only need to add a simple scalar.
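  // E.g. (illustrative): for unrolled part 2 of an i64 induction with step 1,
  // this produces "add i64 %val, 2".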
7647   Type *Ty = Val->getType();
7648   assert(!Ty->isVectorTy() && "Val must be a scalar");
7649 
7650   if (Ty->isFloatingPointTy()) {
7651     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7652 
7653     // Floating point operations had to be 'fast' to enable the unrolling.
7654     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7655     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7656   }
7657   Constant *C = ConstantInt::get(Ty, StartIdx);
7658   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7659 }
7660 
7661 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7662   SmallVector<Metadata *, 4> MDs;
7663   // Reserve first location for self reference to the LoopID metadata node.
7664   MDs.push_back(nullptr);
7665   bool IsUnrollMetadata = false;
7666   MDNode *LoopID = L->getLoopID();
7667   if (LoopID) {
7668     // First find existing loop unrolling disable metadata.
7669     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7670       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7671       if (MD) {
7672         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7673         IsUnrollMetadata =
7674             S && S->getString().startswith("llvm.loop.unroll.disable");
7675       }
7676       MDs.push_back(LoopID->getOperand(i));
7677     }
7678   }
7679 
7680   if (!IsUnrollMetadata) {
7681     // Add runtime unroll disable metadata.
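    // The added metadata has the form (illustrative IR):
    //   !0 = distinct !{!0, ..., !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}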
7682     LLVMContext &Context = L->getHeader()->getContext();
7683     SmallVector<Metadata *, 1> DisableOperands;
7684     DisableOperands.push_back(
7685         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7686     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7687     MDs.push_back(DisableNode);
7688     MDNode *NewLoopID = MDNode::get(Context, MDs);
7689     // Set operand 0 to refer to the loop id itself.
7690     NewLoopID->replaceOperandWith(0, NewLoopID);
7691     L->setLoopID(NewLoopID);
7692   }
7693 }
7694 
7695 //===--------------------------------------------------------------------===//
7696 // EpilogueVectorizerMainLoop
7697 //===--------------------------------------------------------------------===//
7698 
7699 /// This function is partially responsible for generating the control flow
7700 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7701 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7702   MDNode *OrigLoopID = OrigLoop->getLoopID();
7703   Loop *Lp = createVectorLoopSkeleton("");
7704 
7705   // Generate the code to check the minimum iteration count of the vector
7706   // epilogue (see below).
7707   EPI.EpilogueIterationCountCheck =
7708       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
7709   EPI.EpilogueIterationCountCheck->setName("iter.check");
7710 
7711   // Generate the code to check any assumptions that we've made for SCEV
7712   // expressions.
7713   BasicBlock *SavedPreHeader = LoopVectorPreHeader;
7714   emitSCEVChecks(Lp, LoopScalarPreHeader);
7715 
  // If a safety check was generated, save it.
7717   if (SavedPreHeader != LoopVectorPreHeader)
7718     EPI.SCEVSafetyCheck = SavedPreHeader;
7719 
7720   // Generate the code that checks at runtime if arrays overlap. We put the
7721   // checks into a separate block to make the more common case of few elements
7722   // faster.
7723   SavedPreHeader = LoopVectorPreHeader;
7724   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
7725 
  // If a safety check was generated, save/overwrite it.
7727   if (SavedPreHeader != LoopVectorPreHeader)
7728     EPI.MemSafetyCheck = SavedPreHeader;
7729 
  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer path length
  // for the main loop is compensated for by the gain from vectorizing the
  // larger trip count. Note: the branch will get updated later on when we
  // vectorize the epilogue.
7736   EPI.MainLoopIterationCountCheck =
7737       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
7738 
7739   // Generate the induction variable.
7740   OldInduction = Legal->getPrimaryInduction();
7741   Type *IdxTy = Legal->getWidestInductionType();
7742   Value *StartIdx = ConstantInt::get(IdxTy, 0);
7743   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7744   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7745   EPI.VectorTripCount = CountRoundDown;
7746   Induction =
7747       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7748                               getDebugLocFromInstOrOperands(OldInduction));
7749 
7750   // Skip induction resume value creation here because they will be created in
7751   // the second pass. If we created them here, they wouldn't be used anyway,
7752   // because the vplan in the second pass still contains the inductions from the
7753   // original loop.
7754 
7755   return completeLoopSkeleton(Lp, OrigLoopID);
7756 }
7757 
7758 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7759   LLVM_DEBUG({
7760     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7761            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7762            << ", Main Loop UF:" << EPI.MainLoopUF
7763            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7764            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7765   });
7766 }
7767 
7768 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7769   DEBUG_WITH_TYPE(VerboseDebug, {
7770     dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
7771   });
7772 }
7773 
7774 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
7775     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
7776   assert(L && "Expected valid Loop.");
7777   assert(Bypass && "Expected valid bypass basic block.");
7778   unsigned VFactor =
7779       ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
7780   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7781   Value *Count = getOrCreateTripCount(L);
7782   // Reuse existing vector loop preheader for TC checks.
7783   // Note that new preheader block is generated for vector loop.
7784   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7785   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7786 
7787   // Generate code to check if the loop's trip count is less than VF * UF of the
7788   // main vector loop.
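  // E.g. (illustrative): with VFactor * UFactor == 16, we branch to the
  // bypass block when the trip count is below 16 (or also when it equals 16,
  // if a scalar epilogue iteration is required).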
7789   auto P =
7790       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7791 
7792   Value *CheckMinIters = Builder.CreateICmp(
7793       P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
7794       "min.iters.check");
7795 
7796   if (!ForEpilogue)
7797     TCCheckBlock->setName("vector.main.loop.iter.check");
7798 
7799   // Create new preheader for vector loop.
7800   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7801                                    DT, LI, nullptr, "vector.ph");
7802 
7803   if (ForEpilogue) {
7804     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7805                                  DT->getNode(Bypass)->getIDom()) &&
7806            "TC check is expected to dominate Bypass");
7807 
7808     // Update dominator for Bypass & LoopExit.
7809     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7810     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7811 
7812     LoopBypassBlocks.push_back(TCCheckBlock);
7813 
7814     // Save the trip count so we don't have to regenerate it in the
7815     // vec.epilog.iter.check. This is safe to do because the trip count
7816     // generated here dominates the vector epilog iter check.
7817     EPI.TripCount = Count;
7818   }
7819 
7820   ReplaceInstWithInst(
7821       TCCheckBlock->getTerminator(),
7822       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7823 
7824   return TCCheckBlock;
7825 }
7826 
7827 //===--------------------------------------------------------------------===//
7828 // EpilogueVectorizerEpilogueLoop
7829 //===--------------------------------------------------------------------===//
7830 
7831 /// This function is partially responsible for generating the control flow
7832 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7833 BasicBlock *
7834 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7835   MDNode *OrigLoopID = OrigLoop->getLoopID();
7836   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
7837 
7838   // Now, compare the remaining count and if there aren't enough iterations to
7839   // execute the vectorized epilogue skip to the scalar part.
7840   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7841   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7842   LoopVectorPreHeader =
7843       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7844                  LI, nullptr, "vec.epilog.ph");
7845   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
7846                                           VecEpilogueIterationCountCheck);
7847 
7848   // Adjust the control flow taking the state info from the main loop
7849   // vectorization into account.
7850   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7851          "expected this to be saved from the previous pass.");
7852   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7853       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7854 
7855   DT->changeImmediateDominator(LoopVectorPreHeader,
7856                                EPI.MainLoopIterationCountCheck);
7857 
7858   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7859       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7860 
7861   if (EPI.SCEVSafetyCheck)
7862     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7863         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7864   if (EPI.MemSafetyCheck)
7865     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7866         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7867 
7868   DT->changeImmediateDominator(
7869       VecEpilogueIterationCountCheck,
7870       VecEpilogueIterationCountCheck->getSinglePredecessor());
7871 
7872   DT->changeImmediateDominator(LoopScalarPreHeader,
7873                                EPI.EpilogueIterationCountCheck);
7874   DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
7875 
7876   // Keep track of bypass blocks, as they feed start values to the induction
7877   // phis in the scalar loop preheader.
7878   if (EPI.SCEVSafetyCheck)
7879     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7880   if (EPI.MemSafetyCheck)
7881     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7882   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7883 
7884   // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
7886   Type *IdxTy = Legal->getWidestInductionType();
7887   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7888                                          LoopVectorPreHeader->getFirstNonPHI());
7889   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7890   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7891                            EPI.MainLoopIterationCountCheck);
7892 
7893   // Generate the induction variable.
7894   OldInduction = Legal->getPrimaryInduction();
7895   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7896   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7897   Value *StartIdx = EPResumeVal;
7898   Induction =
7899       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7900                               getDebugLocFromInstOrOperands(OldInduction));
7901 
7902   // Generate induction resume values. These variables save the new starting
7903   // indexes for the scalar loop. They are used to test if there are any tail
7904   // iterations left once the vector loop has completed.
7905   // Note that when the vectorized epilogue is skipped due to iteration count
7906   // check, then the resume value for the induction variable comes from
7907   // the trip count of the main vector loop, hence passing the AdditionalBypass
7908   // argument.
7909   createInductionResumeValues(Lp, CountRoundDown,
7910                               {VecEpilogueIterationCountCheck,
7911                                EPI.VectorTripCount} /* AdditionalBypass */);
7912 
7913   AddRuntimeUnrollDisableMetaData(Lp);
7914   return completeLoopSkeleton(Lp, OrigLoopID);
7915 }
7916 
7917 BasicBlock *
7918 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7919     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
7920 
  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
7923   assert(
7924       (!isa<Instruction>(EPI.TripCount) ||
7925        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7926       "saved trip count does not dominate insertion point.");
7927   Value *TC = EPI.TripCount;
7928   IRBuilder<> Builder(Insert->getTerminator());
7929   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7930 
  // Generate code to check if the remaining iteration count is less than
  // VF * UF of the vector epilogue loop.
7933   auto P =
7934       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7935 
7936   Value *CheckMinIters = Builder.CreateICmp(
7937       P, Count,
7938       ConstantInt::get(Count->getType(),
7939                        EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
7940       "min.epilog.iters.check");
7941 
7942   ReplaceInstWithInst(
7943       Insert->getTerminator(),
7944       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7945 
7946   LoopBypassBlocks.push_back(Insert);
7947   return Insert;
7948 }
7949 
7950 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7951   LLVM_DEBUG({
7952     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7953            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7954            << ", Main Loop UF:" << EPI.MainLoopUF
7955            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7956            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7957   });
7958 }
7959 
7960 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7961   DEBUG_WITH_TYPE(VerboseDebug, {
7962     dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
7963   });
7964 }
7965 
7966 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7967     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7968   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7969   bool PredicateAtRangeStart = Predicate(Range.Start);
7970 
7971   for (ElementCount TmpVF = Range.Start * 2;
7972        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7973     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7974       Range.End = TmpVF;
7975       break;
7976     }
7977 
7978   return PredicateAtRangeStart;
7979 }
7980 
7981 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7982 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7983 /// of VF's starting at a given VF and extending it as much as possible. Each
7984 /// vectorization decision can potentially shorten this sub-range during
7985 /// buildVPlan().
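/// For example (illustrative): with \p MinVF = 1 and \p MaxVF = 8, if some
/// vectorization decision first changes at VF = 4, the first VPlan covers
/// {1, 2} and a second VPlan is built for {4, 8}.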
7986 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7987                                            ElementCount MaxVF) {
7988   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7989   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7990     VFRange SubRange = {VF, MaxVFPlusOne};
7991     VPlans.push_back(buildVPlan(SubRange));
7992     VF = SubRange.End;
7993   }
7994 }
7995 
7996 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7997                                          VPlanPtr &Plan) {
7998   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7999 
8000   // Look for cached value.
8001   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8002   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8003   if (ECEntryIt != EdgeMaskCache.end())
8004     return ECEntryIt->second;
8005 
8006   VPValue *SrcMask = createBlockInMask(Src, Plan);
8007 
8008   // The terminator has to be a branch inst!
8009   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8010   assert(BI && "Unexpected terminator found");
8011 
8012   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8013     return EdgeMaskCache[Edge] = SrcMask;
8014 
8015   // If source is an exiting block, we know the exit edge is dynamically dead
8016   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8017   // adding uses of an otherwise potentially dead instruction.
8018   if (OrigLoop->isLoopExiting(Src))
8019     return EdgeMaskCache[Edge] = SrcMask;
8020 
8021   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8022   assert(EdgeMask && "No Edge Mask found for condition");
8023 
8024   if (BI->getSuccessor(0) != Dst)
8025     EdgeMask = Builder.createNot(EdgeMask);
8026 
8027   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
8028     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
8029 
8030   return EdgeMaskCache[Edge] = EdgeMask;
8031 }
8032 
8033 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8034   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8035 
8036   // Look for cached value.
8037   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8038   if (BCEntryIt != BlockMaskCache.end())
8039     return BCEntryIt->second;
8040 
8041   // All-one mask is modelled as no-mask following the convention for masked
8042   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8043   VPValue *BlockMask = nullptr;
8044 
8045   if (OrigLoop->getHeader() == BB) {
8046     if (!CM.blockNeedsPredication(BB))
8047       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8048 
8049     // Create the block in mask as the first non-phi instruction in the block.
8050     VPBuilder::InsertPointGuard Guard(Builder);
8051     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
8052     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
8053 
8054     // Introduce the early-exit compare IV <= BTC to form header block mask.
8055     // This is used instead of IV < TC because TC may wrap, unlike BTC.
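    // E.g. (illustrative): with an i8 IV and a trip count of 256, TC wraps to
    // 0 so "IV < TC" would never hold, whereas BTC == 255 is representable
    // and "IV <= BTC" holds for exactly the 256 original iterations.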
8056     // Start by constructing the desired canonical IV.
8057     VPValue *IV = nullptr;
8058     if (Legal->getPrimaryInduction())
8059       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
8060     else {
8061       auto IVRecipe = new VPWidenCanonicalIVRecipe();
8062       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
8063       IV = IVRecipe->getVPValue();
8064     }
8065     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8066     bool TailFolded = !CM.isScalarEpilogueAllowed();
8067 
8068     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // While ActiveLaneMask is a binary op that consumes the loop tripcount
      // as a second argument, we only pass the IV here and extract the
      // tripcount from the transform state where codegen of the VP
      // instructions happens.
8073       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
8074     } else {
8075       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8076     }
8077     return BlockMaskCache[BB] = BlockMask;
8078   }
8079 
8080   // This is the block mask. We OR all incoming edges.
8081   for (auto *Predecessor : predecessors(BB)) {
8082     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8083     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8084       return BlockMaskCache[BB] = EdgeMask;
8085 
8086     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8087       BlockMask = EdgeMask;
8088       continue;
8089     }
8090 
8091     BlockMask = Builder.createOr(BlockMask, EdgeMask);
8092   }
8093 
8094   return BlockMaskCache[BB] = BlockMask;
8095 }
8096 
8097 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
8098                                                 VPlanPtr &Plan) {
8099   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8100          "Must be called with either a load or store");
8101 
8102   auto willWiden = [&](ElementCount VF) -> bool {
8103     if (VF.isScalar())
8104       return false;
8105     LoopVectorizationCostModel::InstWidening Decision =
8106         CM.getWideningDecision(I, VF);
8107     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8108            "CM decision should be taken at this point.");
8109     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8110       return true;
8111     if (CM.isScalarAfterVectorization(I, VF) ||
8112         CM.isProfitableToScalarize(I, VF))
8113       return false;
8114     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8115   };
8116 
8117   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8118     return nullptr;
8119 
8120   VPValue *Mask = nullptr;
8121   if (Legal->isMaskRequired(I))
8122     Mask = createBlockInMask(I->getParent(), Plan);
8123 
8124   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
8125   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8126     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
8127 
8128   StoreInst *Store = cast<StoreInst>(I);
8129   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
8130   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
8131 }
8132 
8133 VPWidenIntOrFpInductionRecipe *
8134 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const {
8135   // Check if this is an integer or fp induction. If so, build the recipe that
8136   // produces its scalar and vector values.
8137   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8138   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8139       II.getKind() == InductionDescriptor::IK_FpInduction) {
8140     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8141     return new VPWidenIntOrFpInductionRecipe(Phi, Start);
8142   }
8143 
8144   return nullptr;
8145 }
8146 
8147 VPWidenIntOrFpInductionRecipe *
8148 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range,
8149                                                 VPlan &Plan) const {
8150   // Optimize the special case where the source is a constant integer
8151   // induction variable. Notice that we can only optimize the 'trunc' case
8152   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8153   // (c) other casts depend on pointer size.
8154 
8155   // Determine whether \p K is a truncation based on an induction variable that
8156   // can be optimized.
8157   auto isOptimizableIVTruncate =
8158       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8159     return [=](ElementCount VF) -> bool {
8160       return CM.isOptimizableIVTruncate(K, VF);
8161     };
8162   };
8163 
8164   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8165           isOptimizableIVTruncate(I), Range)) {
8166 
8167     InductionDescriptor II =
8168         Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
8169     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8170     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8171                                              Start, I);
8172   }
8173   return nullptr;
8174 }
8175 
8176 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
8177   // We know that all PHIs in non-header blocks are converted into selects, so
8178   // we don't have to worry about the insertion order and we can just use the
8179   // builder. At this point we generate the predication tree. There may be
8180   // duplications since this is a simple recursive scan, but future
8181   // optimizations will clean it up.
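  // E.g. (illustrative): a phi merging %a and %b from two predecessors
  // becomes a blend with operands (%a, edge-mask-a, %b, edge-mask-b), which
  // is later lowered to a chain of selects.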
8182 
8183   SmallVector<VPValue *, 2> Operands;
8184   unsigned NumIncoming = Phi->getNumIncomingValues();
8185   for (unsigned In = 0; In < NumIncoming; In++) {
8186     VPValue *EdgeMask =
8187       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8188     assert((EdgeMask || NumIncoming == 1) &&
8189            "Multiple predecessors with one having a full mask");
8190     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
8191     if (EdgeMask)
8192       Operands.push_back(EdgeMask);
8193   }
8194   return new VPBlendRecipe(Phi, Operands);
8195 }
8196 
8197 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
8198                                                    VPlan &Plan) const {
8199 
8200   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8201       [this, CI](ElementCount VF) {
8202         return CM.isScalarWithPredication(CI, VF);
8203       },
8204       Range);
8205 
8206   if (IsPredicated)
8207     return nullptr;
8208 
8209   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8210   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8211              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8212              ID == Intrinsic::pseudoprobe))
8213     return nullptr;
8214 
8215   auto willWiden = [&](ElementCount VF) -> bool {
8216     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag shows whether we use an intrinsic or a usual call for the
    // vectorized version of the instruction.
    // Is it beneficial to perform an intrinsic call compared to a lib call?
8221     bool NeedToScalarize = false;
8222     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8223     bool UseVectorIntrinsic =
8224         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
8225     return UseVectorIntrinsic || !NeedToScalarize;
8226   };
8227 
8228   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8229     return nullptr;
8230 
8231   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
8232 }
8233 
8234 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8235   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8236          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8237   // Instruction should be widened, unless it is scalar after vectorization,
8238   // scalarization is profitable or it is predicated.
8239   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8240     return CM.isScalarAfterVectorization(I, VF) ||
8241            CM.isProfitableToScalarize(I, VF) ||
8242            CM.isScalarWithPredication(I, VF);
8243   };
8244   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8245                                                              Range);
8246 }
8247 
8248 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
8249   auto IsVectorizableOpcode = [](unsigned Opcode) {
8250     switch (Opcode) {
8251     case Instruction::Add:
8252     case Instruction::And:
8253     case Instruction::AShr:
8254     case Instruction::BitCast:
8255     case Instruction::FAdd:
8256     case Instruction::FCmp:
8257     case Instruction::FDiv:
8258     case Instruction::FMul:
8259     case Instruction::FNeg:
8260     case Instruction::FPExt:
8261     case Instruction::FPToSI:
8262     case Instruction::FPToUI:
8263     case Instruction::FPTrunc:
8264     case Instruction::FRem:
8265     case Instruction::FSub:
8266     case Instruction::ICmp:
8267     case Instruction::IntToPtr:
8268     case Instruction::LShr:
8269     case Instruction::Mul:
8270     case Instruction::Or:
8271     case Instruction::PtrToInt:
8272     case Instruction::SDiv:
8273     case Instruction::Select:
8274     case Instruction::SExt:
8275     case Instruction::Shl:
8276     case Instruction::SIToFP:
8277     case Instruction::SRem:
8278     case Instruction::Sub:
8279     case Instruction::Trunc:
8280     case Instruction::UDiv:
8281     case Instruction::UIToFP:
8282     case Instruction::URem:
8283     case Instruction::Xor:
8284     case Instruction::ZExt:
8285       return true;
8286     }
8287     return false;
8288   };
8289 
8290   if (!IsVectorizableOpcode(I->getOpcode()))
8291     return nullptr;
8292 
8293   // Success: widen this instruction.
8294   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
8295 }
8296 
8297 VPBasicBlock *VPRecipeBuilder::handleReplication(
8298     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8299     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
8300     VPlanPtr &Plan) {
8301   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8302       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8303       Range);
8304 
8305   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8306       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
8307       Range);
8308 
8309   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8310                                        IsUniform, IsPredicated);
8311   setRecipe(I, Recipe);
8312   Plan->addVPValue(I, Recipe);
8313 
8314   // Find if I uses a predicated instruction. If so, it will use its scalar
8315   // value. Avoid hoisting the insert-element which packs the scalar value into
8316   // a vector value, as that happens iff all users use the vector value.
8317   for (auto &Op : I->operands())
8318     if (auto *PredInst = dyn_cast<Instruction>(Op))
8319       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
8320         PredInst2Recipe[PredInst]->setAlsoPack(false);
8321 
8322   // Finalize the recipe for Instr, first if it is not predicated.
8323   if (!IsPredicated) {
8324     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8325     VPBB->appendRecipe(Recipe);
8326     return VPBB;
8327   }
8328   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8329   assert(VPBB->getSuccessors().empty() &&
8330          "VPBB has successors when handling predicated replication.");
8331   // Record predicated instructions for above packing optimizations.
8332   PredInst2Recipe[I] = Recipe;
8333   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8334   VPBlockUtils::insertBlockAfter(Region, VPBB);
8335   auto *RegSucc = new VPBasicBlock();
8336   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8337   return RegSucc;
8338 }
8339 
8340 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8341                                                       VPRecipeBase *PredRecipe,
8342                                                       VPlanPtr &Plan) {
8343   // Instructions marked for predication are replicated and placed under an
8344   // if-then construct to prevent side-effects.
8345 
8346   // Generate recipes to compute the block mask for this region.
8347   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8348 
8349   // Build the triangular if-then region.
8350   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8351   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8352   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8353   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8354   auto *PHIRecipe = Instr->getType()->isVoidTy()
8355                         ? nullptr
8356                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8357   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8358   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8359   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8360 
8361   // Note: first set Entry as region entry and then connect successors starting
8362   // from it in order, to propagate the "parent" of each VPBasicBlock.
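  // The resulting region is a triangle (illustrative): the entry block
  // branches on the mask either to the predicated ".if" block or directly to
  // the ".continue" block, and ".if" falls through to ".continue".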
8363   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8364   VPBlockUtils::connectBlocks(Pred, Exit);
8365 
8366   return Region;
8367 }
8368 
8369 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8370                                                       VFRange &Range,
8371                                                       VPlanPtr &Plan) {
8372   // First, check for specific widening recipes that deal with calls, memory
8373   // operations, inductions and Phi nodes.
8374   if (auto *CI = dyn_cast<CallInst>(Instr))
8375     return tryToWidenCall(CI, Range, *Plan);
8376 
8377   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8378     return tryToWidenMemory(Instr, Range, Plan);
8379 
8380   VPRecipeBase *Recipe;
8381   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8382     if (Phi->getParent() != OrigLoop->getHeader())
8383       return tryToBlend(Phi, Plan);
8384     if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
8385       return Recipe;
8386 
8387     if (Legal->isReductionVariable(Phi)) {
8388       RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8389       VPValue *StartV =
8390           Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue());
8391       return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV);
8392     }
8393 
8394     return new VPWidenPHIRecipe(Phi);
8395   }
8396 
8397   if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8398                                     cast<TruncInst>(Instr), Range, *Plan)))
8399     return Recipe;
8400 
8401   if (!shouldWiden(Instr, Range))
8402     return nullptr;
8403 
8404   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8405     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
8406                                 OrigLoop);
8407 
8408   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8409     bool InvariantCond =
8410         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8411     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
8412                                    InvariantCond);
8413   }
8414 
8415   return tryToWiden(Instr, *Plan);
8416 }
8417 
8418 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8419                                                         ElementCount MaxVF) {
8420   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8421 
8422   // Collect instructions from the original loop that will become trivially dead
8423   // in the vectorized loop. We don't need to vectorize these instructions. For
8424   // example, original induction update instructions can become dead because we
8425   // separately emit induction "steps" when generating code for the new loop.
8426   // Similarly, we create a new latch condition when setting up the structure
8427   // of the new loop, so the old one can become dead.
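  // For example (illustrative), for a canonical induction variable the update
  //   %iv.next = add nuw nsw i64 %iv, 1
  // and the latch compare feeding the backedge branch typically become dead,
  // since the vector loop emits its own induction steps and latch condition.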
8428   SmallPtrSet<Instruction *, 4> DeadInstructions;
8429   collectTriviallyDeadInstructions(DeadInstructions);
8430 
8431   // Add assume instructions we need to drop to DeadInstructions, to prevent
8432   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
8435   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8436   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8437 
8438   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8439   // Dead instructions do not need sinking. Remove them from SinkAfter.
8440   for (Instruction *I : DeadInstructions)
8441     SinkAfter.erase(I);
8442 
8443   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8444   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8445     VFRange SubRange = {VF, MaxVFPlusOne};
8446     VPlans.push_back(
8447         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8448     VF = SubRange.End;
8449   }
8450 }
8451 
8452 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8453     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8454     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8455 
  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of a vector value.
8459   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
8460 
8461   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8462 
8463   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8464 
8465   // ---------------------------------------------------------------------------
8466   // Pre-construction: record ingredients whose recipes we'll need to further
8467   // process after constructing the initial VPlan.
8468   // ---------------------------------------------------------------------------
8469 
8470   // Mark instructions we'll need to sink later and their targets as
8471   // ingredients whose recipe we'll need to record.
8472   for (auto &Entry : SinkAfter) {
8473     RecipeBuilder.recordRecipeOf(Entry.first);
8474     RecipeBuilder.recordRecipeOf(Entry.second);
8475   }
8476   for (auto &Reduction : CM.getInLoopReductionChains()) {
8477     PHINode *Phi = Reduction.first;
8478     RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
8479     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8480 
8481     RecipeBuilder.recordRecipeOf(Phi);
8482     for (auto &R : ReductionOperations) {
8483       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
8486       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8487         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8488     }
8489   }
8490 
8491   // For each interleave group which is relevant for this (possibly trimmed)
8492   // Range, add it to the set of groups to be later applied to the VPlan and add
8493   // placeholders for its members' Recipes which we'll be replacing with a
8494   // single VPInterleaveRecipe.
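  // For example (illustrative), the two loads of an interleaved pair
  //   ... = A[2 * i];
  //   ... = A[2 * i + 1];
  // form a group of factor 2; the individual widening recipes recorded here
  // are later replaced by one VPInterleaveRecipe, which emits a single wide
  // load plus shuffles.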
8495   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8496     auto applyIG = [IG, this](ElementCount VF) -> bool {
8497       return (VF.isVector() && // Query is illegal for VF == 1
8498               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8499                   LoopVectorizationCostModel::CM_Interleave);
8500     };
8501     if (!getDecisionAndClampRange(applyIG, Range))
8502       continue;
8503     InterleaveGroups.insert(IG);
8504     for (unsigned i = 0; i < IG->getFactor(); i++)
8505       if (Instruction *Member = IG->getMember(i))
8506         RecipeBuilder.recordRecipeOf(Member);
8507   };
8508 
8509   // ---------------------------------------------------------------------------
8510   // Build initial VPlan: Scan the body of the loop in a topological order to
8511   // visit each basic block after having visited its predecessor basic blocks.
8512   // ---------------------------------------------------------------------------
8513 
8514   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8515   auto Plan = std::make_unique<VPlan>();
8516   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8517   Plan->setEntry(VPBB);
8518 
8519   // Scan the body of the loop in a topological order to visit each basic block
8520   // after having visited its predecessor basic blocks.
8521   LoopBlocksDFS DFS(OrigLoop);
8522   DFS.perform(LI);
8523 
8524   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients, which will fill a new VPBasicBlock.
8527     unsigned VPBBsForBB = 0;
8528     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8529     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8530     VPBB = FirstVPBBForBB;
8531     Builder.setInsertPoint(VPBB);
8532 
8533     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
8535     for (Instruction &I : BB->instructionsWithoutDebug()) {
8536       Instruction *Instr = &I;
8537 
8538       // First filter out irrelevant instructions, to ensure no recipes are
8539       // built for them.
8540       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8541         continue;
8542 
8543       if (auto Recipe =
8544               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8545         for (auto *Def : Recipe->definedValues()) {
8546           auto *UV = Def->getUnderlyingValue();
8547           Plan->addVPValue(UV, Def);
8548         }
8549 
8550         RecipeBuilder.setRecipe(Instr, Recipe);
8551         VPBB->appendRecipe(Recipe);
8552         continue;
8553       }
8554 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
8557       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
8558           Instr, Range, VPBB, PredInst2Recipe, Plan);
8559       if (NextVPBB != VPBB) {
8560         VPBB = NextVPBB;
8561         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8562                                     : "");
8563       }
8564     }
8565   }
8566 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
8570   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8571   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8572   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8573   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8574   delete PreEntry;
8575 
8576   // ---------------------------------------------------------------------------
8577   // Transform initial VPlan: Apply previously taken decisions, in order, to
8578   // bring the VPlan to its final state.
8579   // ---------------------------------------------------------------------------
8580 
8581   // Apply Sink-After legal constraints.
8582   for (auto &Entry : SinkAfter) {
8583     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8584     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8585     // If the target is in a replication region, make sure to move Sink to the
8586     // block after it, not into the replication region itself.
8587     if (auto *Region =
8588             dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) {
8589       if (Region->isReplicator()) {
8590         assert(Region->getNumSuccessors() == 1 && "Expected SESE region!");
8591         VPBasicBlock *NextBlock =
8592             cast<VPBasicBlock>(Region->getSuccessors().front());
8593         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
8594         continue;
8595       }
8596     }
8597     Sink->moveAfter(Target);
8598   }
8599 
8600   // Interleave memory: for each Interleave Group we marked earlier as relevant
8601   // for this VPlan, replace the Recipes widening its memory instructions with a
8602   // single VPInterleaveRecipe at its insertion point.
8603   for (auto IG : InterleaveGroups) {
8604     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8605         RecipeBuilder.getRecipe(IG->getInsertPos()));
8606     SmallVector<VPValue *, 4> StoredValues;
8607     for (unsigned i = 0; i < IG->getFactor(); ++i)
8608       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
8609         StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
8610 
8611     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8612                                         Recipe->getMask());
8613     VPIG->insertBefore(Recipe);
8614     unsigned J = 0;
8615     for (unsigned i = 0; i < IG->getFactor(); ++i)
8616       if (Instruction *Member = IG->getMember(i)) {
8617         if (!Member->getType()->isVoidTy()) {
8618           VPValue *OriginalV = Plan->getVPValue(Member);
8619           Plan->removeVPValueFor(Member);
8620           Plan->addVPValue(Member, VPIG->getVPValue(J));
8621           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8622           J++;
8623         }
8624         RecipeBuilder.getRecipe(Member)->eraseFromParent();
8625       }
8626   }
8627 
  // Adjust the recipes for any in-loop reductions.
8629   if (Range.Start.isVector())
8630     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
8631 
8632   // Finally, if tail is folded by masking, introduce selects between the phi
8633   // and the live-out instruction of each reduction, at the end of the latch.
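  // For example (illustrative), for a sum reduction this emits at the latch
  //   %rdx.sel = select <VF x i1> %block.mask, <VF x i32> %sum.next,
  //                     <VF x i32> %sum.phi
  // so that lanes masked off by tail folding keep the phi's incoming value.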
8634   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
8635     Builder.setInsertPoint(VPBB);
8636     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
8637     for (auto &Reduction : Legal->getReductionVars()) {
8638       if (CM.isInLoopReduction(Reduction.first))
8639         continue;
8640       VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
8641       VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
8642       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
8643     }
8644   }
8645 
8646   std::string PlanName;
8647   raw_string_ostream RSO(PlanName);
8648   ElementCount VF = Range.Start;
8649   Plan->addVF(VF);
8650   RSO << "Initial VPlan for VF={" << VF;
8651   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
8652     Plan->addVF(VF);
8653     RSO << "," << VF;
8654   }
8655   RSO << "},UF>=1";
8656   RSO.flush();
8657   Plan->setName(PlanName);
8658 
8659   return Plan;
8660 }
8661 
8662 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8663   // Outer loop handling: They may require CFG and instruction level
8664   // transformations before even evaluating whether vectorization is profitable.
8665   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8666   // the vectorization pipeline.
8667   assert(!OrigLoop->isInnermost());
8668   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8669 
8670   // Create new empty VPlan
8671   auto Plan = std::make_unique<VPlan>();
8672 
8673   // Build hierarchical CFG
8674   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8675   HCFGBuilder.buildHierarchicalCFG();
8676 
8677   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8678        VF *= 2)
8679     Plan->addVF(VF);
8680 
8681   if (EnableVPlanPredication) {
8682     VPlanPredicator VPP(*Plan);
8683     VPP.predicate();
8684 
8685     // Avoid running transformation to recipes until masked code generation in
8686     // VPlan-native path is in place.
8687     return Plan;
8688   }
8689 
8690   SmallPtrSet<Instruction *, 1> DeadInstructions;
8691   VPlanTransforms::VPInstructionsToVPRecipes(
8692       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
8693   return Plan;
8694 }
8695 
// Adjust the recipes for any in-loop reductions. The chain of instructions
// leading from the loop exit instr to the phi needs to be converted to
// reductions, with one operand being the vector value and the other being the
// scalar reduction chain.
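// For example (illustrative), for an in-loop integer sum reduction
//   %sum.next = add i32 %sum.phi, %val
// the widened add's recipe is replaced by a VPReductionRecipe that reduces the
// vector operand (%val) and accumulates the scalar result into the chain
// (%sum.phi); for min/max reductions the now-dead icmp recipe is erased too.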
8700 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8701     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8702   for (auto &Reduction : CM.getInLoopReductionChains()) {
8703     PHINode *Phi = Reduction.first;
8704     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8705     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8706 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
8711     Instruction *Chain = Phi;
8712     for (Instruction *R : ReductionOperations) {
8713       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8714       RecurKind Kind = RdxDesc.getRecurrenceKind();
8715 
8716       VPValue *ChainOp = Plan->getVPValue(Chain);
8717       unsigned FirstOpId;
8718       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8719         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8720                "Expected to replace a VPWidenSelectSC");
8721         FirstOpId = 1;
8722       } else {
8723         assert(isa<VPWidenRecipe>(WidenRecipe) &&
8724                "Expected to replace a VPWidenSC");
8725         FirstOpId = 0;
8726       }
8727       unsigned VecOpId =
8728           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
8729       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
8730 
8731       auto *CondOp = CM.foldTailByMasking()
8732                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
8733                          : nullptr;
8734       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
8735           &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
8736       WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe);
8737       Plan->removeVPValueFor(R);
8738       Plan->addVPValue(R, RedRecipe);
8739       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
8741       WidenRecipe->eraseFromParent();
8742 
8743       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8744         VPRecipeBase *CompareRecipe =
8745             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
8746         assert(isa<VPWidenRecipe>(CompareRecipe) &&
8747                "Expected to replace a VPWidenSC");
8748         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
8749                "Expected no remaining users");
8750         CompareRecipe->eraseFromParent();
8751       }
8752       Chain = R;
8753     }
8754   }
8755 }
8756 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
8761 
8762 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
8763     Value *V, const VPIteration &Instance) {
8764   return ILV.getOrCreateScalarValue(V, Instance);
8765 }
8766 
8767 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
8768                                VPSlotTracker &SlotTracker) const {
8769   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
8770   IG->getInsertPos()->printAsOperand(O, false);
8771   O << ", ";
8772   getAddr()->printAsOperand(O, SlotTracker);
8773   VPValue *Mask = getMask();
8774   if (Mask) {
8775     O << ", ";
8776     Mask->printAsOperand(O, SlotTracker);
8777   }
8778   for (unsigned i = 0; i < IG->getFactor(); ++i)
8779     if (Instruction *I = IG->getMember(i))
8780       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
8781 }
8782 
8783 void VPWidenCallRecipe::execute(VPTransformState &State) {
8784   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
8785                                   *this, State);
8786 }
8787 
8788 void VPWidenSelectRecipe::execute(VPTransformState &State) {
8789   State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
8790                                     this, *this, InvariantCond, State);
8791 }
8792 
8793 void VPWidenRecipe::execute(VPTransformState &State) {
8794   State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
8795 }
8796 
8797 void VPWidenGEPRecipe::execute(VPTransformState &State) {
8798   State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
8799                       *this, State.UF, State.VF, IsPtrLoopInvariant,
8800                       IsIndexLoopInvariant, State);
8801 }
8802 
8803 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
8804   assert(!State.Instance && "Int or FP induction being replicated.");
8805   State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
8806                                    Trunc);
8807 }
8808 
8809 void VPWidenPHIRecipe::execute(VPTransformState &State) {
8810   Value *StartV =
8811       getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr;
8812   State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF);
8813 }
8814 
8815 void VPBlendRecipe::execute(VPTransformState &State) {
8816   State.ILV->setDebugLocFromInst(State.Builder, Phi);
8817   // We know that all PHIs in non-header blocks are converted into
8818   // selects, so we don't have to worry about the insertion order and we
8819   // can just use the builder.
8820   // At this point we generate the predication tree. There may be
8821   // duplications since this is a simple recursive scan, but future
8822   // optimizations will clean it up.
8823 
8824   unsigned NumIncoming = getNumIncomingValues();
8825 
8826   // Generate a sequence of selects of the form:
8827   // SELECT(Mask3, In3,
8828   //        SELECT(Mask2, In2,
8829   //               SELECT(Mask1, In1,
8830   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are essentially undef, take their value from In0.
8833   InnerLoopVectorizer::VectorParts Entry(State.UF);
8834   for (unsigned In = 0; In < NumIncoming; ++In) {
8835     for (unsigned Part = 0; Part < State.UF; ++Part) {
8836       // We might have single edge PHIs (blocks) - use an identity
8837       // 'select' for the first PHI operand.
8838       Value *In0 = State.get(getIncomingValue(In), Part);
8839       if (In == 0)
8840         Entry[Part] = In0; // Initialize with the first incoming value.
8841       else {
8842         // Select between the current value and the previous incoming edge
8843         // based on the incoming mask.
8844         Value *Cond = State.get(getMask(In), Part);
8845         Entry[Part] =
8846             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8847       }
8848     }
8849   }
8850   for (unsigned Part = 0; Part < State.UF; ++Part)
8851     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
8852 }
8853 
8854 void VPInterleaveRecipe::execute(VPTransformState &State) {
8855   assert(!State.Instance && "Interleave group being replicated.");
8856   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
8857                                       getStoredValues(), getMask());
8858 }
8859 
8860 void VPReductionRecipe::execute(VPTransformState &State) {
8861   assert(!State.Instance && "Reduction being replicated.");
8862   for (unsigned Part = 0; Part < State.UF; ++Part) {
8863     RecurKind Kind = RdxDesc->getRecurrenceKind();
8864     Value *NewVecOp = State.get(getVecOp(), Part);
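    // If a condition (block-in mask) is attached, blend the masked-off lanes
    // with the recurrence identity so they do not affect the reduced value.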
8865     if (VPValue *Cond = getCondOp()) {
8866       Value *NewCond = State.get(Cond, Part);
8867       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
8868       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
8869           Kind, VecTy->getElementType());
8870       Constant *IdenVec =
8871           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
8872       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
8873       NewVecOp = Select;
8874     }
8875     Value *NewRed =
8876         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
8877     Value *PrevInChain = State.get(getChainOp(), Part);
8878     Value *NextInChain;
8879     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8880       NextInChain =
8881           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
8882                          NewRed, PrevInChain);
8883     } else {
8884       NextInChain = State.Builder.CreateBinOp(
8885           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
8886           PrevInChain);
8887     }
8888     State.set(this, getUnderlyingInstr(), NextInChain, Part);
8889   }
8890 }
8891 
8892 void VPReplicateRecipe::execute(VPTransformState &State) {
8893   if (State.Instance) { // Generate a single instance.
8894     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
8895     State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
8896                                     *State.Instance, IsPredicated, State);
8897     // Insert scalar instance packing it into a vector.
8898     if (AlsoPack && State.VF.isVector()) {
8899       // If we're constructing lane 0, initialize to start from poison.
8900       if (State.Instance->Lane == 0) {
8901         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8902         Value *Poison = PoisonValue::get(
8903             VectorType::get(getUnderlyingValue()->getType(), State.VF));
8904         State.ValueMap.setVectorValue(getUnderlyingInstr(),
8905                                       State.Instance->Part, Poison);
8906       }
8907       State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
8908                                            *State.Instance);
8909     }
8910     return;
8911   }
8912 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
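  // For example (illustrative), an address computation that is uniform across
  // lanes needs only lane 0 of each unrolled part rather than all VF lanes.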
8916   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8917   assert((!State.VF.isScalable() || IsUniform) &&
8918          "Can't scalarize a scalable vector");
8919   for (unsigned Part = 0; Part < State.UF; ++Part)
8920     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8921       State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
8922                                       IsPredicated, State);
8923 }
8924 
8925 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8926   assert(State.Instance && "Branch on Mask works only on single instance.");
8927 
8928   unsigned Part = State.Instance->Part;
8929   unsigned Lane = State.Instance->Lane;
8930 
8931   Value *ConditionBit = nullptr;
8932   VPValue *BlockInMask = getMask();
8933   if (BlockInMask) {
8934     ConditionBit = State.get(BlockInMask, Part);
8935     if (ConditionBit->getType()->isVectorTy())
8936       ConditionBit = State.Builder.CreateExtractElement(
8937           ConditionBit, State.Builder.getInt32(Lane));
8938   } else // Block in mask is all-one.
8939     ConditionBit = State.Builder.getTrue();
8940 
8941   // Replace the temporary unreachable terminator with a new conditional branch,
8942   // whose two destinations will be set later when they are created.
8943   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8944   assert(isa<UnreachableInst>(CurrentTerminator) &&
8945          "Expected to replace unreachable terminator with conditional branch.");
8946   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8947   CondBr->setSuccessor(0, nullptr);
8948   ReplaceInstWithInst(CurrentTerminator, CondBr);
8949 }
8950 
8951 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8952   assert(State.Instance && "Predicated instruction PHI works per instance.");
8953   Instruction *ScalarPredInst =
8954       cast<Instruction>(State.get(getOperand(0), *State.Instance));
8955   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8956   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8957   assert(PredicatingBB && "Predicated block has no single predecessor.");
8958 
8959   // By current pack/unpack logic we need to generate only a single phi node: if
8960   // a vector value for the predicated instruction exists at this point it means
8961   // the instruction has vector users only, and a phi for the vector value is
8962   // needed. In this case the recipe of the predicated instruction is marked to
8963   // also do that packing, thereby "hoisting" the insert-element sequence.
8964   // Otherwise, a phi node for the scalar value is needed.
8965   unsigned Part = State.Instance->Part;
8966   Instruction *PredInst =
8967       cast<Instruction>(getOperand(0)->getUnderlyingValue());
8968   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8969     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8970     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8971     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8972     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8973     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8974     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8975   } else {
8976     Type *PredInstType = PredInst->getType();
8977     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8978     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB);
8979     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8980     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8981   }
8982 }
8983 
8984 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8985   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
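  // Only a load defines a result; for a store, pass a null defined VPValue.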
8986   State.ILV->vectorizeMemoryInstruction(&Ingredient, State,
8987                                         StoredValue ? nullptr : getVPValue(),
8988                                         getAddr(), StoredValue, getMask());
8989 }
8990 
8991 // Determine how to lower the scalar epilogue, which depends on 1) optimising
8992 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8993 // predication, and 4) a TTI hook that analyses whether the loop is suitable
8994 // for predication.
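// For example (illustrative), if the function carries the optsize attribute,
// case 1) below returns CM_ScalarEpilogueNotAllowedOptSize and the options,
// hints and TTI hook in 2)-4) are never consulted.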
8995 static ScalarEpilogueLowering getScalarEpilogueLowering(
8996     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8997     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8998     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8999     LoopVectorizationLegality &LVL) {
9000   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9001   // don't look at hints or options, and don't request a scalar epilogue.
9002   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9003   // LoopAccessInfo (due to code dependency and not being able to reliably get
9004   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9005   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9006   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9007   // back to the old way and vectorize with versioning when forced. See D81345.)
9008   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9009                                                       PGSOQueryType::IRPass) &&
9010                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9011     return CM_ScalarEpilogueNotAllowedOptSize;
9012 
9013   // 2) If set, obey the directives
9014   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9015     switch (PreferPredicateOverEpilogue) {
9016     case PreferPredicateTy::ScalarEpilogue:
9017       return CM_ScalarEpilogueAllowed;
9018     case PreferPredicateTy::PredicateElseScalarEpilogue:
9019       return CM_ScalarEpilogueNotNeededUsePredicate;
9020     case PreferPredicateTy::PredicateOrDontVectorize:
9021       return CM_ScalarEpilogueNotAllowedUsePredicate;
9022     };
9023   }
9024 
9025   // 3) If set, obey the hints
9026   switch (Hints.getPredicate()) {
9027   case LoopVectorizeHints::FK_Enabled:
9028     return CM_ScalarEpilogueNotNeededUsePredicate;
9029   case LoopVectorizeHints::FK_Disabled:
9030     return CM_ScalarEpilogueAllowed;
9031   };
9032 
  // 4) If the TTI hook indicates this is profitable, request predication.
9034   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
9035                                        LVL.getLAI()))
9036     return CM_ScalarEpilogueNotNeededUsePredicate;
9037 
9038   return CM_ScalarEpilogueAllowed;
9039 }
9040 
9041 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
9042                            unsigned Part) {
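  // Record the value in the VPlan-managed state and mirror it in ILV's
  // per-part value map so both lookups stay consistent.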
9043   set(Def, V, Part);
9044   ILV->setVectorValue(IRDef, Part, V);
9045 }
9046 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows VPlan-to-VPlan
// transformations to be applied from the very beginning without modifying the
// input LLVM IR.
9051 static bool processLoopInVPlanNativePath(
9052     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9053     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9054     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9055     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9056     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
9057 
9058   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9059     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9060     return false;
9061   }
9062   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9063   Function *F = L->getHeader()->getParent();
9064   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9065 
9066   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9067       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
9068 
9069   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9070                                 &Hints, IAI);
9071   // Use the planner for outer loop vectorization.
9072   // TODO: CM is not used at this point inside the planner. Turn CM into an
9073   // optional argument if we don't need it in the future.
9074   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
9075 
9076   // Get user vectorization factor.
9077   ElementCount UserVF = Hints.getWidth();
9078 
9079   // Plan how to best vectorize, return the best VF and its cost.
9080   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9081 
9082   // If we are stress testing VPlan builds, do not attempt to generate vector
9083   // code. Masked vector code generation support will follow soon.
9084   // Also, do not attempt to vectorize if no vector code will be produced.
9085   if (VPlanBuildStressTest || EnableVPlanPredication ||
9086       VectorizationFactor::Disabled() == VF)
9087     return false;
9088 
9089   LVP.setBestPlan(VF.Width, 1);
9090 
9091   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
9092                          &CM, BFI, PSI);
9093   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9094                     << L->getHeader()->getParent()->getName() << "\"\n");
9095   LVP.executePlan(LB, DT);
9096 
9097   // Mark the loop as already vectorized to avoid vectorizing again.
9098   Hints.setAlreadyVectorized();
9099 
9100   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9101   return true;
9102 }
9103 
9104 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9105     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9106                                !EnableLoopInterleaving),
9107       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9108                               !EnableLoopVectorization) {}
9109 
9110 bool LoopVectorizePass::processLoop(Loop *L) {
9111   assert((EnableVPlanNativePath || L->isInnermost()) &&
9112          "VPlan-native path is not enabled. Only process inner loops.");
9113 
9114 #ifndef NDEBUG
9115   const std::string DebugLocStr = getDebugLocString(L);
9116 #endif /* NDEBUG */
9117 
9118   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
9119                     << L->getHeader()->getParent()->getName() << "\" from "
9120                     << DebugLocStr << "\n");
9121 
9122   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
9123 
9124   LLVM_DEBUG(
9125       dbgs() << "LV: Loop hints:"
9126              << " force="
9127              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9128                      ? "disabled"
9129                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9130                             ? "enabled"
9131                             : "?"))
9132              << " width=" << Hints.getWidth()
9133              << " unroll=" << Hints.getInterleave() << "\n");
9134 
9135   // Function containing loop
9136   Function *F = L->getHeader()->getParent();
9137 
9138   // Looking at the diagnostic output is the only way to determine if a loop
9139   // was vectorized (other than looking at the IR or machine code), so it
9140   // is important to generate an optimization remark for each loop. Most of
9141   // these messages are generated as OptimizationRemarkAnalysis. Remarks
9142   // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose, reporting vectorized loops and unvectorized loops that may
9144   // benefit from vectorization, respectively.
9145 
9146   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9147     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9148     return false;
9149   }
9150 
9151   PredicatedScalarEvolution PSE(*SE, *L);
9152 
9153   // Check if it is legal to vectorize the loop.
9154   LoopVectorizationRequirements Requirements(*ORE);
9155   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
9156                                 &Requirements, &Hints, DB, AC, BFI, PSI);
9157   if (!LVL.canVectorize(EnableVPlanNativePath)) {
9158     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9159     Hints.emitRemarkWithHints();
9160     return false;
9161   }
9162 
9163   // Check the function attributes and profiles to find out if this function
9164   // should be optimized for size.
9165   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9166       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
9167 
9168   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9169   // here. They may require CFG and instruction level transformations before
9170   // even evaluating whether vectorization is profitable. Since we cannot modify
9171   // the incoming IR, we need to build VPlan upfront in the vectorization
9172   // pipeline.
9173   if (!L->isInnermost())
9174     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9175                                         ORE, BFI, PSI, Hints);
9176 
9177   assert(L->isInnermost() && "Inner loop expected.");
9178 
9179   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9180   // count by optimizing for size, to minimize overheads.
9181   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9182   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9183     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9184                       << "This loop is worth vectorizing only if no scalar "
9185                       << "iteration overheads are incurred.");
9186     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9187       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9188     else {
9189       LLVM_DEBUG(dbgs() << "\n");
9190       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9191     }
9192   }
9193 
9194   // Check the function attributes to see if implicit floats are allowed.
9195   // FIXME: This check doesn't seem possibly correct -- what if the loop is
9196   // an integer loop and the vector instructions selected are purely integer
9197   // vector instructions?
9198   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9199     reportVectorizationFailure(
9200         "Can't vectorize when the NoImplicitFloat attribute is used",
9201         "loop not vectorized due to NoImplicitFloat attribute",
9202         "NoImplicitFloat", ORE, L);
9203     Hints.emitRemarkWithHints();
9204     return false;
9205   }
9206 
9207   // Check if the target supports potentially unsafe FP vectorization.
9208   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9209   // for the target we're vectorizing for, to make sure none of the
9210   // additional fp-math flags can help.
9211   if (Hints.isPotentiallyUnsafe() &&
9212       TTI->isFPVectorizationPotentiallyUnsafe()) {
9213     reportVectorizationFailure(
9214         "Potentially unsafe FP op prevents vectorization",
9215         "loop not vectorized due to unsafe FP support.",
9216         "UnsafeFP", ORE, L);
9217     Hints.emitRemarkWithHints();
9218     return false;
9219   }
9220 
9221   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9222   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9223 
9224   // If an override option has been passed in for interleaved accesses, use it.
9225   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9226     UseInterleaved = EnableInterleavedMemAccesses;
9227 
9228   // Analyze interleaved memory accesses.
9229   if (UseInterleaved) {
9230     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9231   }
9232 
9233   // Use the cost model.
9234   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9235                                 F, &Hints, IAI);
9236   CM.collectValuesToIgnore();
9237 
9238   // Use the planner for vectorization.
9239   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
9240 
9241   // Get user vectorization factor and interleave count.
9242   ElementCount UserVF = Hints.getWidth();
9243   unsigned UserIC = Hints.getInterleave();
9244 
9245   // Plan how to best vectorize, return the best VF and its cost.
9246   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9247 
9248   VectorizationFactor VF = VectorizationFactor::Disabled();
9249   unsigned IC = 1;
9250 
9251   if (MaybeVF) {
9252     VF = *MaybeVF;
9253     // Select the interleave count.
9254     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9255   }
9256 
9257   // Identify the diagnostic messages that should be produced.
9258   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9259   bool VectorizeLoop = true, InterleaveLoop = true;
9260   if (Requirements.doesNotMeet(F, L, Hints)) {
9261     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
9262                          "requirements.\n");
9263     Hints.emitRemarkWithHints();
9264     return false;
9265   }
9266 
9267   if (VF.Width.isScalar()) {
9268     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9269     VecDiagMsg = std::make_pair(
9270         "VectorizationNotBeneficial",
9271         "the cost-model indicates that vectorization is not beneficial");
9272     VectorizeLoop = false;
9273   }
9274 
9275   if (!MaybeVF && UserIC > 1) {
9276     // Tell the user interleaving was avoided up-front, despite being explicitly
9277     // requested.
9278     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9279                          "interleaving should be avoided up front\n");
9280     IntDiagMsg = std::make_pair(
9281         "InterleavingAvoided",
9282         "Ignoring UserIC, because interleaving was avoided up front");
9283     InterleaveLoop = false;
9284   } else if (IC == 1 && UserIC <= 1) {
9285     // Tell the user interleaving is not beneficial.
9286     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9287     IntDiagMsg = std::make_pair(
9288         "InterleavingNotBeneficial",
9289         "the cost-model indicates that interleaving is not beneficial");
9290     InterleaveLoop = false;
9291     if (UserIC == 1) {
9292       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9293       IntDiagMsg.second +=
9294           " and is explicitly disabled or interleave count is set to 1";
9295     }
9296   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
9298     LLVM_DEBUG(
9299         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9300     IntDiagMsg = std::make_pair(
9301         "InterleavingBeneficialButDisabled",
9302         "the cost-model indicates that interleaving is beneficial "
9303         "but is explicitly disabled or interleave count is set to 1");
9304     InterleaveLoop = false;
9305   }
9306 
9307   // Override IC if user provided an interleave count.
9308   IC = UserIC > 0 ? UserIC : IC;
9309 
9310   // Emit diagnostic messages, if any.
9311   const char *VAPassName = Hints.vectorizeAnalysisPassName();
9312   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
9314     ORE->emit([&]() {
9315       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9316                                       L->getStartLoc(), L->getHeader())
9317              << VecDiagMsg.second;
9318     });
9319     ORE->emit([&]() {
9320       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9321                                       L->getStartLoc(), L->getHeader())
9322              << IntDiagMsg.second;
9323     });
9324     return false;
9325   } else if (!VectorizeLoop && InterleaveLoop) {
9326     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9327     ORE->emit([&]() {
9328       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9329                                         L->getStartLoc(), L->getHeader())
9330              << VecDiagMsg.second;
9331     });
9332   } else if (VectorizeLoop && !InterleaveLoop) {
9333     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9334                       << ") in " << DebugLocStr << '\n');
9335     ORE->emit([&]() {
9336       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9337                                         L->getStartLoc(), L->getHeader())
9338              << IntDiagMsg.second;
9339     });
9340   } else if (VectorizeLoop && InterleaveLoop) {
9341     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9342                       << ") in " << DebugLocStr << '\n');
9343     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9344   }
9345 
9346   LVP.setBestPlan(VF.Width, IC);
9347 
9348   using namespace ore;
9349   bool DisableRuntimeUnroll = false;
9350   MDNode *OrigLoopID = L->getLoopID();
9351 
9352   if (!VectorizeLoop) {
9353     assert(IC > 1 && "interleave count should not be 1 or 0");
9354     // If we decided that it is not legal to vectorize the loop, then
9355     // interleave it.
9356     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
9357                                BFI, PSI);
9358     LVP.executePlan(Unroller, DT);
9359 
9360     ORE->emit([&]() {
9361       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9362                                 L->getHeader())
9363              << "interleaved loop (interleaved count: "
9364              << NV("InterleaveCount", IC) << ")";
9365     });
9366   } else {
9367     // If we decided that it is *legal* to vectorize the loop, then do it.
9368 
9369     // Consider vectorizing the epilogue too if it's profitable.
9370     VectorizationFactor EpilogueVF =
9371       CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
9372     if (EpilogueVF.Width.isVector()) {
9373 
9374       // The first pass vectorizes the main loop and creates a scalar epilogue
9375       // to be vectorized by executing the plan (potentially with a different
9376       // factor) again shortly afterwards.
9377       EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
9378                                         EpilogueVF.Width.getKnownMinValue(), 1);
9379       EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI,
9380                                          &LVL, &CM, BFI, PSI);
9381 
9382       LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
9383       LVP.executePlan(MainILV, DT);
9384       ++LoopsVectorized;
9385 
9386       simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9387       formLCSSARecursively(*L, *DT, LI, SE);
9388 
9389       // Second pass vectorizes the epilogue and adjusts the control flow
9390       // edges from the first pass.
9391       LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
9392       EPI.MainLoopVF = EPI.EpilogueVF;
9393       EPI.MainLoopUF = EPI.EpilogueUF;
9394       EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
9395                                                ORE, EPI, &LVL, &CM, BFI, PSI);
9396       LVP.executePlan(EpilogILV, DT);
9397       ++LoopsEpilogueVectorized;
9398 
9399       if (!MainILV.areSafetyChecksAdded())
9400         DisableRuntimeUnroll = true;
9401     } else {
9402       InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
9403                              &LVL, &CM, BFI, PSI);
9404       LVP.executePlan(LB, DT);
9405       ++LoopsVectorized;
9406 
9407       // Add metadata to disable runtime unrolling a scalar loop when there are
9408       // no runtime checks about strides and memory. A scalar loop that is
9409       // rarely used is not worth unrolling.
9410       if (!LB.areSafetyChecksAdded())
9411         DisableRuntimeUnroll = true;
9412     }
9413 
9414     // Report the vectorization decision.
9415     ORE->emit([&]() {
9416       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
9417                                 L->getHeader())
9418              << "vectorized loop (vectorization width: "
9419              << NV("VectorizationFactor", VF.Width)
9420              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
9421     });
9422   }
9423 
9424   Optional<MDNode *> RemainderLoopID =
9425       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
9426                                       LLVMLoopVectorizeFollowupEpilogue});
9427   if (RemainderLoopID.hasValue()) {
9428     L->setLoopID(RemainderLoopID.getValue());
9429   } else {
9430     if (DisableRuntimeUnroll)
9431       AddRuntimeUnrollDisableMetaData(L);
9432 
9433     // Mark the loop as already vectorized to avoid vectorizing again.
9434     Hints.setAlreadyVectorized();
9435   }
9436 
9437   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9438   return true;
9439 }
9440 
9441 LoopVectorizeResult LoopVectorizePass::runImpl(
9442     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
9443     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
9444     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
9445     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
9446     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
9447   SE = &SE_;
9448   LI = &LI_;
9449   TTI = &TTI_;
9450   DT = &DT_;
9451   BFI = &BFI_;
9452   TLI = TLI_;
9453   AA = &AA_;
9454   AC = &AC_;
9455   GetLAA = &GetLAA_;
9456   DB = &DB_;
9457   ORE = &ORE_;
9458   PSI = PSI_;
9459 
9460   // Don't attempt if
9461   // 1. the target claims to have no vector registers, and
9462   // 2. interleaving won't help ILP.
9463   //
9464   // The second condition is necessary because, even if the target has no
9465   // vector registers, loop vectorization may still enable scalar
9466   // interleaving.
9467   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
9468       TTI->getMaxInterleaveFactor(1) < 2)
9469     return LoopVectorizeResult(false, false);
9470 
9471   bool Changed = false, CFGChanged = false;
9472 
9473   // The vectorizer requires loops to be in simplified form.
9474   // Since simplification may add new inner loops, it has to run before the
9475   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
9477   // vectorized.
9478   for (auto &L : *LI)
9479     Changed |= CFGChanged |=
9480         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9481 
9482   // Build up a worklist of inner-loops to vectorize. This is necessary as
9483   // the act of vectorizing or partially unrolling a loop creates new loops
9484   // and can invalidate iterators across the loops.
9485   SmallVector<Loop *, 8> Worklist;
9486 
9487   for (Loop *L : *LI)
9488     collectSupportedLoops(*L, LI, ORE, Worklist);
9489 
9490   LoopsAnalyzed += Worklist.size();
9491 
9492   // Now walk the identified inner loops.
9493   while (!Worklist.empty()) {
9494     Loop *L = Worklist.pop_back_val();
9495 
9496     // For the inner loops we actually process, form LCSSA to simplify the
9497     // transform.
9498     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
9499 
9500     Changed |= CFGChanged |= processLoop(L);
9501   }
9502 
9503   // Process each loop nest in the function.
9504   return LoopVectorizeResult(Changed, CFGChanged);
9505 }
9506 
9507 PreservedAnalyses LoopVectorizePass::run(Function &F,
9508                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
9552 }
9553