1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/InstructionCost.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
142 #include "llvm/Transforms/Utils/SizeOpts.h"
143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
144 #include <algorithm>
145 #include <cassert>
146 #include <cstdint>
147 #include <cstdlib>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
/// Tag for extra-verbose debug output; enable with
/// -debug-only=loop-vectorize-verbose.
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names used to attach followup loop metadata to the
/// loops produced by the transformation.
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
/// Master switch for vectorizing the remainder (epilogue) loop left over
/// after the main vector loop.
static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

/// Testing knob: a value > 1 forces that VF on every epilogue loop,
/// bypassing the cost model.
static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

/// Minimum main-loop VF for an epilogue loop to be considered worth
/// vectorizing.
static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
// and predicate the instructions accordingly. If tail-folding fails, there are
// different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

/// Selects the fallback strategy (see PreferPredicateTy above) when
/// tail-folding is requested but cannot be applied.
static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                         "scalar-epilogue",
                         "Don't tail-predicate loops, create scalar epilogue"),
              clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                         "predicate-else-scalar-epilogue",
                         "prefer tail-folding, create scalar epilogue if tail "
                         "folding fails."),
              clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                         "predicate-dont-vectorize",
                         "prefers tail-folding, don't attempt vectorization if "
                         "tail-folding fails.")));

/// When set, the VF is chosen based on the narrowest element type in the
/// loop, so narrow lanes are packed as densely as possible.
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

/// Enables vectorization of interleaved (strided, grouped) memory accesses.
static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

/// Loops with an estimated trip count below this threshold are not
/// interleaved.
static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with a estimated constant trip count "
             "below this number"));
251 
/// Testing knob: overrides the target's reported scalar register count
/// (0 = use the target's value).
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

/// Testing knob: overrides the target's reported vector register count
/// (0 = use the target's value).
static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

/// Testing knob: overrides the target's max interleave factor for scalar
/// loops (0 = use the target's value).
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

/// Testing knob: overrides the target's max interleave factor for vector
/// loops (0 = use the target's value).
static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

/// Testing knob: makes every instruction cost this constant, giving
/// target-independent cost-model tests (0 = use the target's costs).
static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

/// Testing knob: pretend the target supports scalable vectors.
static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

/// Cost threshold below which a loop body is considered "small" by the
/// interleaving heuristics.
static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

/// When enabled, profile (PGO) data steers the size-vs-speed trade-off:
/// conservative in cold code, aggressive in hot code.
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

/// Register-pressure heuristic: count the induction variable once rather
/// than once per interleaved copy.
static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

/// Enables if-conversion (predication) of conditional stores.
static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

/// Cap on the interleave count used for a scalar reduction inside a nested
/// loop.
static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

/// Prefer computing reductions inside the loop body over out-of-loop
/// horizontal reductions, regardless of the target's preference.
static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the targets preference."));

/// Prefer predicating the reduction operation itself over selecting the
/// result after the loop.
static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));
333 
/// Enables the experimental VPlan-native vectorization path (outer-loop
/// vectorization). Non-static: referenced from other vectorizer sources.
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

/// Global switches declared in LoopVectorize.h; defined here so the pass
/// and its callers share one setting.
cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));
363 
364 /// A helper function that returns the type of loaded or stored value.
365 static Type *getMemInstValueType(Value *I) {
366   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
367          "Expected Load or Store instruction");
368   if (auto *LI = dyn_cast<LoadInst>(I))
369     return LI->getType();
370   return cast<StoreInst>(I)->getValueOperand()->getType();
371 }
372 
373 /// A helper function that returns true if the given type is irregular. The
374 /// type is irregular if its allocated size doesn't equal the store size of an
375 /// element of the corresponding vector type at the given vectorization factor.
376 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
377   // Determine if an array of VF elements of type Ty is "bitcast compatible"
378   // with a <VF x Ty> vector.
379   if (VF.isVector()) {
380     auto *VectorTy = VectorType::get(Ty, VF);
381     return TypeSize::get(VF.getKnownMinValue() *
382                              DL.getTypeAllocSize(Ty).getFixedValue(),
383                          VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
384   }
385 
386   // If the vectorization factor is one, we just check if an array of type Ty
387   // requires padding between elements.
388   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
389 }
390 
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
///       we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() {
  // Fixed 50% execution assumption => reciprocal probability of 2.
  return 2;
}
398 
399 /// A helper function that adds a 'fast' flag to floating-point operations.
400 static Value *addFastMathFlag(Value *V) {
401   if (isa<FPMathOperator>(V))
402     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
403   return V;
404 }
405 
406 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
407   if (isa<FPMathOperator>(V))
408     cast<Instruction>(V)->setFastMathFlags(FMF);
409   return V;
410 }
411 
412 /// A helper function that returns an integer or floating-point constant with
413 /// value C.
414 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
415   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
416                            : ConstantFP::get(Ty, C);
417 }
418 
419 /// Returns "best known" trip count for the specified loop \p L as defined by
420 /// the following procedure:
421 ///   1) Returns exact trip count if it is known.
422 ///   2) Returns expected trip count according to profile data if any.
423 ///   3) Returns upper bound estimate if it is known.
424 ///   4) Returns None if all of the above failed.
425 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
426   // Check if exact trip count is known.
427   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
428     return ExpectedTC;
429 
430   // Check if there is an expected trip count available from profile data.
431   if (LoopVectorizeWithBlockFrequency)
432     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
433       return EstimatedTC;
434 
435   // Check if upper bound estimate is known.
436   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
437     return ExpectedTC;
438 
439   return None;
440 }
441 
442 namespace llvm {
443 
444 /// InnerLoopVectorizer vectorizes loops which contain only one basic
445 /// block to a specified vectorization factor (VF).
446 /// This class performs the widening of scalars into vectors, or multiple
447 /// scalars. This class also implements the following features:
448 /// * It inserts an epilogue loop for handling loops that don't have iteration
449 ///   counts that are known to be a multiple of the vectorization factor.
450 /// * It handles the code generation for reduction variables.
451 /// * Scalarization (implementation using scalars) of un-vectorizable
452 ///   instructions.
453 /// InnerLoopVectorizer does not perform any vectorization-legality
454 /// checks, and relies on the caller to check for the different legality
455 /// aspects. The InnerLoopVectorizer relies on the
456 /// LoopVectorizationLegality class to provide information about the induction
457 /// and reduction variables that were found to a given vectorization factor.
458 class InnerLoopVectorizer {
459 public:
460   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
461                       LoopInfo *LI, DominatorTree *DT,
462                       const TargetLibraryInfo *TLI,
463                       const TargetTransformInfo *TTI, AssumptionCache *AC,
464                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
465                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
466                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
467                       ProfileSummaryInfo *PSI)
468       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
469         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
470         Builder(PSE.getSE()->getContext()),
471         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
472         BFI(BFI), PSI(PSI) {
473     // Query this against the original loop and save it here because the profile
474     // of the original loop header may change as the transformation happens.
475     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
476         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
477   }
478 
479   virtual ~InnerLoopVectorizer() = default;
480 
481   /// Create a new empty loop that will contain vectorized instructions later
482   /// on, while the old loop will be used as the scalar remainder. Control flow
483   /// is generated around the vectorized (and scalar epilogue) loops consisting
484   /// of various checks and bypasses. Return the pre-header block of the new
485   /// loop.
486   /// In the case of epilogue vectorization, this function is overriden to
487   /// handle the more complex control flow around the loops.
488   virtual BasicBlock *createVectorizedLoopSkeleton();
489 
490   /// Widen a single instruction within the innermost loop.
491   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
492                         VPTransformState &State);
493 
494   /// Widen a single call instruction within the innermost loop.
495   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
496                             VPTransformState &State);
497 
498   /// Widen a single select instruction within the innermost loop.
499   void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
500                               bool InvariantCond, VPTransformState &State);
501 
502   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
503   void fixVectorizedLoop();
504 
505   // Return true if any runtime check is added.
506   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
507 
508   /// A type for vectorized values in the new loop. Each value from the
509   /// original loop, when vectorized, is represented by UF vector values in the
510   /// new unrolled loop, where UF is the unroll factor.
511   using VectorParts = SmallVector<Value *, 2>;
512 
513   /// Vectorize a single GetElementPtrInst based on information gathered and
514   /// decisions taken during planning.
515   void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
516                 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
517                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
518 
519   /// Vectorize a single PHINode in a block. This method handles the induction
520   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
521   /// arbitrary length vectors.
522   void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
523                            Value *StartV, unsigned UF, ElementCount VF);
524 
525   /// A helper function to scalarize a single Instruction in the innermost loop.
526   /// Generates a sequence of scalar instances for each lane between \p MinLane
527   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
528   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
529   /// Instr's operands.
530   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
531                             const VPIteration &Instance, bool IfPredicateInstr,
532                             VPTransformState &State);
533 
534   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
535   /// is provided, the integer induction variable will first be truncated to
536   /// the corresponding type.
537   void widenIntOrFpInduction(PHINode *IV, Value *Start,
538                              TruncInst *Trunc = nullptr);
539 
540   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
541   /// vector or scalar value on-demand if one is not yet available. When
542   /// vectorizing a loop, we visit the definition of an instruction before its
543   /// uses. When visiting the definition, we either vectorize or scalarize the
544   /// instruction, creating an entry for it in the corresponding map. (In some
545   /// cases, such as induction variables, we will create both vector and scalar
546   /// entries.) Then, as we encounter uses of the definition, we derive values
547   /// for each scalar or vector use unless such a value is already available.
548   /// For example, if we scalarize a definition and one of its uses is vector,
549   /// we build the required vector on-demand with an insertelement sequence
550   /// when visiting the use. Otherwise, if the use is scalar, we can use the
551   /// existing scalar definition.
552   ///
553   /// Return a value in the new loop corresponding to \p V from the original
554   /// loop at unroll index \p Part. If the value has already been vectorized,
555   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
556   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
557   /// a new vector value on-demand by inserting the scalar values into a vector
558   /// with an insertelement sequence. If the value has been neither vectorized
559   /// nor scalarized, it must be loop invariant, so we simply broadcast the
560   /// value into a vector.
561   Value *getOrCreateVectorValue(Value *V, unsigned Part);
562 
563   void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
564     VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
565   }
566 
567   /// Return a value in the new loop corresponding to \p V from the original
568   /// loop at unroll and vector indices \p Instance. If the value has been
569   /// vectorized but not scalarized, the necessary extractelement instruction
570   /// will be generated.
571   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
572 
573   /// Construct the vector value of a scalarized value \p V one lane at a time.
574   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
575 
576   /// Try to vectorize interleaved access group \p Group with the base address
577   /// given in \p Addr, optionally masking the vector operations if \p
578   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
579   /// values in the vectorized loop.
580   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
581                                 ArrayRef<VPValue *> VPDefs,
582                                 VPTransformState &State, VPValue *Addr,
583                                 ArrayRef<VPValue *> StoredValues,
584                                 VPValue *BlockInMask = nullptr);
585 
586   /// Vectorize Load and Store instructions with the base address given in \p
587   /// Addr, optionally masking the vector operations if \p BlockInMask is
588   /// non-null. Use \p State to translate given VPValues to IR values in the
589   /// vectorized loop.
590   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
591                                   VPValue *Def, VPValue *Addr,
592                                   VPValue *StoredValue, VPValue *BlockInMask);
593 
594   /// Set the debug location in the builder using the debug location in
595   /// the instruction.
596   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
597 
598   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
599   void fixNonInductionPHIs(void);
600 
601 protected:
602   friend class LoopVectorizationPlanner;
603 
604   /// A small list of PHINodes.
605   using PhiVector = SmallVector<PHINode *, 4>;
606 
607   /// A type for scalarized values in the new loop. Each value from the
608   /// original loop, when scalarized, is represented by UF x VF scalar values
609   /// in the new unrolled loop, where UF is the unroll factor and VF is the
610   /// vectorization factor.
611   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
612 
613   /// Set up the values of the IVs correctly when exiting the vector loop.
614   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
615                     Value *CountRoundDown, Value *EndValue,
616                     BasicBlock *MiddleBlock);
617 
618   /// Create a new induction variable inside L.
619   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
620                                    Value *Step, Instruction *DL);
621 
622   /// Handle all cross-iteration phis in the header.
623   void fixCrossIterationPHIs();
624 
625   /// Fix a first-order recurrence. This is the second phase of vectorizing
626   /// this phi node.
627   void fixFirstOrderRecurrence(PHINode *Phi);
628 
629   /// Fix a reduction cross-iteration phi. This is the second phase of
630   /// vectorizing this phi node.
631   void fixReduction(PHINode *Phi);
632 
633   /// Clear NSW/NUW flags from reduction instructions if necessary.
634   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
635 
636   /// Fixup the LCSSA phi nodes in the unique exit block.  This simply
637   /// means we need to add the appropriate incoming value from the middle
638   /// block as exiting edges from the scalar epilogue loop (if present) are
639   /// already in place, and we exit the vector loop exclusively to the middle
640   /// block.
641   void fixLCSSAPHIs();
642 
643   /// Iteratively sink the scalarized operands of a predicated instruction into
644   /// the block that was created for it.
645   void sinkScalarOperands(Instruction *PredInst);
646 
647   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
648   /// represented as.
649   void truncateToMinimalBitwidths();
650 
651   /// Create a broadcast instruction. This method generates a broadcast
652   /// instruction (shuffle) for loop invariant values and for the induction
653   /// value. If this is the induction variable then we extend it to N, N+1, ...
654   /// this is needed because each iteration in the loop corresponds to a SIMD
655   /// element.
656   virtual Value *getBroadcastInstrs(Value *V);
657 
658   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
659   /// to each vector element of Val. The sequence starts at StartIndex.
660   /// \p Opcode is relevant for FP induction variable.
661   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
662                                Instruction::BinaryOps Opcode =
663                                Instruction::BinaryOpsEnd);
664 
665   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
666   /// variable on which to base the steps, \p Step is the size of the step, and
667   /// \p EntryVal is the value from the original loop that maps to the steps.
668   /// Note that \p EntryVal doesn't have to be an induction variable - it
669   /// can also be a truncate instruction.
670   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
671                         const InductionDescriptor &ID);
672 
673   /// Create a vector induction phi node based on an existing scalar one. \p
674   /// EntryVal is the value from the original loop that maps to the vector phi
675   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
676   /// truncate instruction, instead of widening the original IV, we widen a
677   /// version of the IV truncated to \p EntryVal's type.
678   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
679                                        Value *Step, Value *Start,
680                                        Instruction *EntryVal);
681 
682   /// Returns true if an instruction \p I should be scalarized instead of
683   /// vectorized for the chosen vectorization factor.
684   bool shouldScalarizeInstruction(Instruction *I) const;
685 
686   /// Returns true if we should generate a scalar version of \p IV.
687   bool needsScalarInduction(Instruction *IV) const;
688 
689   /// If there is a cast involved in the induction variable \p ID, which should
690   /// be ignored in the vectorized loop body, this function records the
691   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
692   /// cast. We had already proved that the casted Phi is equal to the uncasted
693   /// Phi in the vectorized loop (under a runtime guard), and therefore
694   /// there is no need to vectorize the cast - the same value can be used in the
695   /// vector loop for both the Phi and the cast.
696   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
697   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
698   ///
699   /// \p EntryVal is the value from the original loop that maps to the vector
700   /// phi node and is used to distinguish what is the IV currently being
701   /// processed - original one (if \p EntryVal is a phi corresponding to the
702   /// original IV) or the "newly-created" one based on the proof mentioned above
703   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
704   /// latter case \p EntryVal is a TruncInst and we must not record anything for
705   /// that IV, but it's error-prone to expect callers of this routine to care
706   /// about that, hence this explicit parameter.
707   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
708                                              const Instruction *EntryVal,
709                                              Value *VectorLoopValue,
710                                              unsigned Part,
711                                              unsigned Lane = UINT_MAX);
712 
713   /// Generate a shuffle sequence that will reverse the vector Vec.
714   virtual Value *reverseVector(Value *Vec);
715 
716   /// Returns (and creates if needed) the original loop trip count.
717   Value *getOrCreateTripCount(Loop *NewLoop);
718 
719   /// Returns (and creates if needed) the trip count of the widened loop.
720   Value *getOrCreateVectorTripCount(Loop *NewLoop);
721 
722   /// Returns a bitcasted value to the requested vector type.
723   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
724   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
725                                 const DataLayout &DL);
726 
727   /// Emit a bypass check to see if the vector trip count is zero, including if
728   /// it overflows.
729   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
730 
731   /// Emit a bypass check to see if all of the SCEV assumptions we've
732   /// had to make are correct.
733   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
734 
735   /// Emit bypass checks to check any memory assumptions we may have made.
736   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
737 
738   /// Compute the transformed value of Index at offset StartValue using step
739   /// StepValue.
740   /// For integer induction, returns StartValue + Index * StepValue.
741   /// For pointer induction, returns StartValue[Index * StepValue].
742   /// FIXME: The newly created binary instructions should contain nsw/nuw
743   /// flags, which can be found from the original scalar operations.
744   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
745                               const DataLayout &DL,
746                               const InductionDescriptor &ID) const;
747 
748   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
749   /// vector loop preheader, middle block and scalar preheader. Also
750   /// allocate a loop object for the new vector loop and return it.
751   Loop *createVectorLoopSkeleton(StringRef Prefix);
752 
753   /// Create new phi nodes for the induction variables to resume iteration count
754   /// in the scalar epilogue, from where the vectorized loop left off (given by
755   /// \p VectorTripCount).
756   /// In cases where the loop skeleton is more complicated (eg. epilogue
757   /// vectorization) and the resume values can come from an additional bypass
758   /// block, the \p AdditionalBypass pair provides information about the bypass
759   /// block and the end value on the edge from bypass to this loop.
760   void createInductionResumeValues(
761       Loop *L, Value *VectorTripCount,
762       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
763 
764   /// Complete the loop skeleton by adding debug MDs, creating appropriate
765   /// conditional branches in the middle block, preparing the builder and
766   /// running the verifier. Take in the vector loop \p L as argument, and return
767   /// the preheader of the completed vector loop.
768   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
769 
770   /// Add additional metadata to \p To that was not present on \p Orig.
771   ///
772   /// Currently this is used to add the noalias annotations based on the
773   /// inserted memchecks.  Use this for instructions that are *cloned* into the
774   /// vector loop.
775   void addNewMetadata(Instruction *To, const Instruction *Orig);
776 
777   /// Add metadata from one instruction to another.
778   ///
779   /// This includes both the original MDs from \p From and additional ones (\see
780   /// addNewMetadata).  Use this for *newly created* instructions in the vector
781   /// loop.
782   void addMetadata(Instruction *To, Instruction *From);
783 
784   /// Similar to the previous function but it adds the metadata to a
785   /// vector of instructions.
786   void addMetadata(ArrayRef<Value *> To, Instruction *From);
787 
788   /// Allow subclasses to override and print debug traces before/after vplan
789   /// execution, when trace information is requested.
790   virtual void printDebugTracesAtStart(){};
791   virtual void printDebugTracesAtEnd(){};
792 
793   /// The original loop.
794   Loop *OrigLoop;
795 
796   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
797   /// dynamic knowledge to simplify SCEV expressions and converts them to a
798   /// more usable form.
799   PredicatedScalarEvolution &PSE;
800 
801   /// Loop Info.
802   LoopInfo *LI;
803 
804   /// Dominator Tree.
805   DominatorTree *DT;
806 
807   /// Alias Analysis.
808   AAResults *AA;
809 
810   /// Target Library Info.
811   const TargetLibraryInfo *TLI;
812 
813   /// Target Transform Info.
814   const TargetTransformInfo *TTI;
815 
816   /// Assumption Cache.
817   AssumptionCache *AC;
818 
819   /// Interface to emit optimization remarks.
820   OptimizationRemarkEmitter *ORE;
821 
822   /// LoopVersioning.  It's only set up (non-null) if memchecks were
823   /// used.
824   ///
825   /// This is currently only used to add no-alias metadata based on the
826   /// memchecks.  The actually versioning is performed manually.
827   std::unique_ptr<LoopVersioning> LVer;
828 
829   /// The vectorization SIMD factor to use. Each vector will have this many
830   /// vector elements.
831   ElementCount VF;
832 
833   /// The vectorization unroll factor to use. Each scalar is vectorized to this
834   /// many different vector instructions.
835   unsigned UF;
836 
837   /// The builder that we use
838   IRBuilder<> Builder;
839 
840   // --- Vectorization state ---
841 
842   /// The vector-loop preheader.
843   BasicBlock *LoopVectorPreHeader;
844 
845   /// The scalar-loop preheader.
846   BasicBlock *LoopScalarPreHeader;
847 
848   /// Middle Block between the vector and the scalar.
849   BasicBlock *LoopMiddleBlock;
850 
851   /// The (unique) ExitBlock of the scalar loop.  Note that
852   /// there can be multiple exiting edges reaching this block.
853   BasicBlock *LoopExitBlock;
854 
855   /// The vector loop body.
856   BasicBlock *LoopVectorBody;
857 
858   /// The scalar loop body.
859   BasicBlock *LoopScalarBody;
860 
861   /// A list of all bypass blocks. The first block is the entry of the loop.
862   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
863 
864   /// The new Induction variable which was added to the new block.
865   PHINode *Induction = nullptr;
866 
867   /// The induction variable of the old basic block.
868   PHINode *OldInduction = nullptr;
869 
870   /// Maps values from the original loop to their corresponding values in the
871   /// vectorized loop. A key value can map to either vector values, scalar
872   /// values or both kinds of values, depending on whether the key was
873   /// vectorized and scalarized.
874   VectorizerValueMap VectorLoopValueMap;
875 
876   /// Store instructions that were predicated.
877   SmallVector<Instruction *, 4> PredicatedInstructions;
878 
879   /// Trip count of the original loop.
880   Value *TripCount = nullptr;
881 
882   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
883   Value *VectorTripCount = nullptr;
884 
885   /// The legality analysis.
886   LoopVectorizationLegality *Legal;
887 
888   /// The profitablity analysis.
889   LoopVectorizationCostModel *Cost;
890 
891   // Record whether runtime checks are added.
892   bool AddedSafetyChecks = false;
893 
894   // Holds the end values for each induction variable. We save the end values
895   // so we can later fix-up the external users of the induction variables.
896   DenseMap<PHINode *, Value *> IVEndValues;
897 
898   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
899   // fixed up at the end of vector code generation.
900   SmallVector<PHINode *, 8> OrigPHIsToFix;
901 
902   /// BFI and PSI are used to check for profile guided size optimizations.
903   BlockFrequencyInfo *BFI;
904   ProfileSummaryInfo *PSI;
905 
906   // Whether this loop should be optimized for size based on profile guided size
907   // optimizatios.
908   bool OptForSizeBasedOnProfile;
909 };
910 
/// A specialization of InnerLoopVectorizer that forces the vectorization
/// factor to one (see ElementCount::getFixed(1) below), so the shared
/// widening machinery emits unrolled scalar code instead of vector code.
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI) {}

private:
  // Scalar (VF = 1) overrides of the vector-producing hooks.
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(Value *Val, int StartIdx, Value *Step,
                       Instruction::BinaryOps Opcode =
                       Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};
932 
/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  /// VF and UF chosen for the main vector loop.
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  /// VF and UF chosen for the vectorized epilogue loop.
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  /// Skeleton blocks shared between the two vectorization stages so checks
  /// are generated once and reused (see the strategy classes below).
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  /// Trip counts computed once and shared across both stages.
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
                                unsigned EUF)
      : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
        EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
    // The epilogue handles few iterations; unrolling it is unlikely to pay.
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
956 
/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to setup the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue.  This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  /// Constructs the base vectorizer with the main loop's VF/UF taken from
  /// \p EPI; the shared \p EPI state is kept by reference (see member below).
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  BasicBlock *createVectorizedLoopSkeleton() final override {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};
996 
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (ie the first pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
                                             bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
1024 
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE,
                    EpilogueLoopVectorizationInfo &EPI,
                    LoopVectorizationLegality *LVL,
                    llvm::LoopVectorizationCostModel *CM,
                    BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (ie the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
                                                      BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
1055 } // end namespace llvm
1056 
1057 /// Look for a meaningful debug location on the instruction or it's
1058 /// operands.
1059 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1060   if (!I)
1061     return I;
1062 
1063   DebugLoc Empty;
1064   if (I->getDebugLoc() != Empty)
1065     return I;
1066 
1067   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
1068     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
1069       if (OpInst->getDebugLoc() != Empty)
1070         return OpInst;
1071   }
1072 
1073   return I;
1074 }
1075 
1076 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
1077   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1078     const DILocation *DIL = Inst->getDebugLoc();
1079     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1080         !isa<DbgInfoIntrinsic>(Inst)) {
1081       assert(!VF.isScalable() && "scalable vectors not yet supported.");
1082       auto NewDIL =
1083           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1084       if (NewDIL)
1085         B.SetCurrentDebugLocation(NewDIL.getValue());
1086       else
1087         LLVM_DEBUG(dbgs()
1088                    << "Failed to create new discriminator: "
1089                    << DIL->getFilename() << " Line: " << DIL->getLine());
1090     }
1091     else
1092       B.SetCurrentDebugLocation(DIL);
1093   } else
1094     B.SetCurrentDebugLocation(DebugLoc());
1095 }
1096 
1097 /// Write a record \p DebugMsg about vectorization failure to the debug
1098 /// output stream. If \p I is passed, it is an instruction that prevents
1099 /// vectorization.
1100 #ifndef NDEBUG
1101 static void debugVectorizationFailure(const StringRef DebugMsg,
1102     Instruction *I) {
1103   dbgs() << "LV: Not vectorizing: " << DebugMsg;
1104   if (I != nullptr)
1105     dbgs() << " " << *I;
1106   else
1107     dbgs() << '.';
1108   dbgs() << '\n';
1109 }
1110 #endif
1111 
1112 /// Create an analysis remark that explains why vectorization failed
1113 ///
1114 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1115 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1116 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1117 /// the location of the remark.  \return the remark object that can be
1118 /// streamed to.
1119 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1120     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1121   Value *CodeRegion = TheLoop->getHeader();
1122   DebugLoc DL = TheLoop->getStartLoc();
1123 
1124   if (I) {
1125     CodeRegion = I->getParent();
1126     // If there is no debug location attached to the instruction, revert back to
1127     // using the loop's.
1128     if (I->getDebugLoc())
1129       DL = I->getDebugLoc();
1130   }
1131 
1132   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
1133   R << "loop not vectorized: ";
1134   return R;
1135 }
1136 
1137 /// Return a value for Step multiplied by VF.
1138 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1139   assert(isa<ConstantInt>(Step) && "Expected an integer step");
1140   Constant *StepVal = ConstantInt::get(
1141       Step->getType(),
1142       cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1143   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1144 }
1145 
1146 namespace llvm {
1147 
/// Report a vectorization failure: print \p DebugMsg to the debug output
/// stream (asserts builds only) and emit an optimization remark \p OREMsg,
/// identified by \p ORETag, via \p ORE. The remark is attributed to \p I when
/// non-null, otherwise to \p TheLoop.
void reportVectorizationFailure(const StringRef DebugMsg,
    const StringRef OREMsg, const StringRef ORETag,
    OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
  LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
  // Hints are only consulted for the analysis pass name here.
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
                ORETag, TheLoop, I) << OREMsg);
}
1156 
1157 } // end namespace llvm
1158 
1159 #ifndef NDEBUG
1160 /// \return string containing a file name and a line # for the given loop.
1161 static std::string getDebugLocString(const Loop *L) {
1162   std::string Result;
1163   if (L) {
1164     raw_string_ostream OS(Result);
1165     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1166       LoopDbgLoc.print(OS);
1167     else
1168       // Just print the module name.
1169       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1170     OS.flush();
1171   }
1172   return Result;
1173 }
1174 #endif
1175 
1176 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1177                                          const Instruction *Orig) {
1178   // If the loop was versioned with memchecks, add the corresponding no-alias
1179   // metadata.
1180   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1181     LVer->annotateInstWithNoAlias(To, Orig);
1182 }
1183 
// Copy \p From's metadata onto \p To, then layer on any extra metadata that
// only exists in the vectorized loop (see addNewMetadata).
void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}
1189 
1190 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1191                                       Instruction *From) {
1192   for (Value *V : To) {
1193     if (Instruction *I = dyn_cast<Instruction>(V))
1194       addMetadata(I, From);
1195   }
1196 }
1197 
1198 namespace llvm {
1199 
// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};
1222 
1223 /// LoopVectorizationCostModel - estimates the expected speedups due to
1224 /// vectorization.
1225 /// In many cases vectorization is not profitable. This can happen because of
1226 /// a number of reasons. In this class we mainly attempt to predict the
1227 /// expected speedup/slowdowns due to the supported instruction set. We use the
1228 /// TargetTransformInfo to query the different backends for the cost of
1229 /// different operations.
1230 class LoopVectorizationCostModel {
1231 public:
  /// Construct the cost model. Only stores references to the injected
  /// analyses; no analysis runs here.
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}
1244 
1245   /// \return An upper bound for the vectorization factor, or None if
1246   /// vectorization and interleaving should be avoided up front.
1247   Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1248 
1249   /// \return True if runtime checks are required for vectorization, and false
1250   /// otherwise.
1251   bool runtimeChecksRequired();
1252 
1253   /// \return The most profitable vectorization factor and the cost of that VF.
1254   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
1255   /// then this vectorization factor will be selected if vectorization is
1256   /// possible.
1257   VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1258   VectorizationFactor
1259   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1260                                     const LoopVectorizationPlanner &LVP);
1261 
  /// Setup cost-based decisions for user vectorization factor.
  /// Runs the uniformity/scalarization analyses for \p UserVF so later cost
  /// queries for that VF find their data collected.
  void selectUserVectorizationFactor(ElementCount UserVF) {
    collectUniformsAndScalars(UserVF);
    collectInstsToScalarize(UserVF);
  }
1267 
1268   /// \return The size (in bits) of the smallest and widest types in the code
1269   /// that needs to be vectorized. We ignore values that remain scalar such as
1270   /// 64 bit loop indices.
1271   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1272 
1273   /// \return The desired interleave count.
1274   /// If interleave count has been specified by metadata it will be returned.
1275   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1276   /// are the selected vectorization factor and the cost of the selected VF.
1277   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1278 
1279   /// Memory access instruction may be vectorized in more than one way.
1280   /// Form of instruction after vectorization depends on cost.
1281   /// This function takes cost-based decisions for Load/Store instructions
1282   /// and collects them in a map. This decisions map is used for building
1283   /// the lists of loop-uniform and loop-scalar instructions.
1284   /// The calculated cost is saved with widening decision in order to
1285   /// avoid redundant calculations.
1286   void setCostBasedWideningDecision(ElementCount VF);
1287 
  /// A struct that represents some properties of the register usage
  /// of a loop. Returned per candidate VF by calculateRegisterUsage().
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };
1298 
1299   /// \return Returns information about the register usages of the loop for the
1300   /// given vectorization factors.
1301   SmallVector<RegisterUsage, 8>
1302   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1303 
1304   /// Collect values we want to ignore in the cost model.
1305   void collectValuesToIgnore();
1306 
1307   /// Split reductions into those that happen in the loop, and those that happen
1308   /// outside. In loop reductions are collected into InLoopReductionChains.
1309   void collectInLoopReductions();
1310 
  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  /// NOTE: returns a reference to internal state; do not hold it across
  /// updates of the cost model.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }
1317 
1318   /// \returns True if it is more profitable to scalarize instruction \p I for
1319   /// vectorization factor \p VF.
1320   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1321     assert(VF.isVector() &&
1322            "Profitable to scalarize relevant only for VF > 1.");
1323 
1324     // Cost model is not run in the VPlan-native path - return conservative
1325     // result until this changes.
1326     if (EnableVPlanNativePath)
1327       return false;
1328 
1329     auto Scalars = InstsToScalarize.find(VF);
1330     assert(Scalars != InstsToScalarize.end() &&
1331            "VF not yet analyzed for scalarization profitability");
1332     return Scalars->second.find(I) != Scalars->second.end();
1333   }
1334 
1335   /// Returns true if \p I is known to be uniform after vectorization.
1336   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1337     if (VF.isScalar())
1338       return true;
1339 
1340     // Cost model is not run in the VPlan-native path - return conservative
1341     // result until this changes.
1342     if (EnableVPlanNativePath)
1343       return false;
1344 
1345     auto UniformsPerVF = Uniforms.find(VF);
1346     assert(UniformsPerVF != Uniforms.end() &&
1347            "VF not yet analyzed for uniformity");
1348     return UniformsPerVF->second.count(I);
1349   }
1350 
1351   /// Returns true if \p I is known to be scalar after vectorization.
1352   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1353     if (VF.isScalar())
1354       return true;
1355 
1356     // Cost model is not run in the VPlan-native path - return conservative
1357     // result until this changes.
1358     if (EnableVPlanNativePath)
1359       return false;
1360 
1361     auto ScalarsPerVF = Scalars.find(VF);
1362     assert(ScalarsPerVF != Scalars.end() &&
1363            "Scalar values are not calculated for VF");
1364     return ScalarsPerVF->second.count(I);
1365   }
1366 
1367   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1368   /// for vectorization factor \p VF.
1369   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1370     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1371            !isProfitableToScalarize(I, VF) &&
1372            !isScalarAfterVectorization(I, VF);
1373   }
1374 
  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,       // No decision has been recorded for this instruction.
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,    // For accesses handled as part of an interleave group.
    CM_GatherScatter, // For accesses widened to a masked gather/scatter.
    CM_Scalarize      // For accesses replicated as scalar operations.
  };
1384 
1385   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1386   /// instruction \p I and vector width \p VF.
1387   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1388                            InstructionCost Cost) {
1389     assert(VF.isVector() && "Expected VF >=2");
1390     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1391   }
1392 
1393   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1394   /// interleaving group \p Grp and vector width \p VF.
1395   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1396                            ElementCount VF, InstWidening W,
1397                            InstructionCost Cost) {
1398     assert(VF.isVector() && "Expected VF >=2");
1399     /// Broadcast this decicion to all instructions inside the group.
1400     /// But the cost will be assigned to one instruction only.
1401     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1402       if (auto *I = Grp->getMember(i)) {
1403         if (Grp->getInsertPos() == I)
1404           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1405         else
1406           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1407       }
1408     }
1409   }
1410 
1411   /// Return the cost model decision for the given instruction \p I and vector
1412   /// width \p VF. Return CM_Unknown if this instruction did not pass
1413   /// through the cost modeling.
1414   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1415     assert(VF.isVector() && "Expected VF to be a vector VF");
1416     // Cost model is not run in the VPlan-native path - return conservative
1417     // result until this changes.
1418     if (EnableVPlanNativePath)
1419       return CM_GatherScatter;
1420 
1421     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1422     auto Itr = WideningDecisions.find(InstOnVF);
1423     if (Itr == WideningDecisions.end())
1424       return CM_Unknown;
1425     return Itr->second.first;
1426   }
1427 
1428   /// Return the vectorization cost for the given instruction \p I and vector
1429   /// width \p VF.
1430   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1431     assert(VF.isVector() && "Expected VF >=2");
1432     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1433     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1434            "The cost is not calculated");
1435     return WideningDecisions[InstOnVF].second;
1436   }
1437 
1438   /// Return True if instruction \p I is an optimizable truncate whose operand
1439   /// is an induction variable. Such a truncate will be removed by adding a new
1440   /// induction variable with the destination type.
1441   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1442     // If the instruction is not a truncate, return false.
1443     auto *Trunc = dyn_cast<TruncInst>(I);
1444     if (!Trunc)
1445       return false;
1446 
1447     // Get the source and destination types of the truncate.
1448     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1449     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1450 
1451     // If the truncate is free for the given types, return false. Replacing a
1452     // free truncate with an induction variable would add an induction variable
1453     // update instruction to each iteration of the loop. We exclude from this
1454     // check the primary induction variable since it will need an update
1455     // instruction regardless.
1456     Value *Op = Trunc->getOperand(0);
1457     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1458       return false;
1459 
1460     // If the truncated value is not an induction variable, return false.
1461     return Legal->isInductionPhi(Op);
1462   }
1463 
  /// Collects the instructions to scalarize for each predicated instruction
  /// in the loop, for the given vectorization factor \p VF.
  void collectInstsToScalarize(ElementCount VF);
1467 
1468   /// Collect Uniform and Scalar values for the given \p VF.
1469   /// The sets depend on CM decision for Load/Store instructions
1470   /// that may be vectorized as interleave, gather-scatter or scalarized.
1471   void collectUniformsAndScalars(ElementCount VF) {
1472     // Do the analysis once.
1473     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1474       return;
1475     setCostBasedWideningDecision(VF);
1476     collectLoopUniforms(VF);
1477     collectLoopScalars(VF);
1478   }
1479 
  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr. Requires both
  /// a consecutive pointer and target support for the masked store.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr. Requires both
  /// a consecutive pointer and target support for the masked load.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
    return Legal->isConsecutivePtr(Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked scatter operation
  /// for the given \p DataType and \p Alignment.
  bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedScatter(DataType, Alignment);
  }

  /// Returns true if the target machine supports masked gather operation
  /// for the given \p DataType and \p Alignment.
  bool isLegalMaskedGather(Type *DataType, Align Alignment) {
    return TTI.isLegalMaskedGather(DataType, Alignment);
  }
1505 
1506   /// Returns true if the target machine can represent \p V as a masked gather
1507   /// or scatter operation.
1508   bool isLegalGatherOrScatter(Value *V) {
1509     bool LI = isa<LoadInst>(V);
1510     bool SI = isa<StoreInst>(V);
1511     if (!LI && !SI)
1512       return false;
1513     auto *Ty = getMemInstValueType(V);
1514     Align Align = getLoadStoreAlignment(V);
1515     return (LI && isLegalMaskedGather(Ty, Align)) ||
1516            (SI && isLegalMaskedScatter(Ty, Align));
1517   }
1518 
  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I,
                               ElementCount VF = ElementCount::getFixed(1));
1526 
1527   // Returns true if \p I is an instruction that will be predicated either
1528   // through scalar predication or masked load/store or masked gather/scatter.
1529   // Superset of instructions that return true for isScalarWithPredication.
1530   bool isPredicatedInst(Instruction *I) {
1531     if (!blockNeedsPredication(I->getParent()))
1532       return false;
1533     // Loads and stores that need some form of masked operation are predicated
1534     // instructions.
1535     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1536       return Legal->isMaskRequired(I);
1537     return isScalarWithPredication(I);
1538   }
1539 
  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }
1563 
1564   /// Returns true if we're required to use a scalar epilogue for at least
1565   /// the final iteration of the original loop.
1566   bool requiresScalarEpilogue() const {
1567     if (!isScalarEpilogueAllowed())
1568       return false;
1569     // If we might exit from anywhere but the latch, must run the exiting
1570     // iteration in scalar form.
1571     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1572       return true;
1573     return InterleaveInfo.requiresScalarEpilogue();
1574   }
1575 
  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  /// Returns true if block \p BB needs predication: either because the whole
  /// loop is tail-folded by masking, or because legality analysis determined
  /// that the block itself requires it.
  bool blockNeedsPredication(BasicBlock *BB) {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }
1604 
  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF);

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized -
  /// i.e. either vector version isn't available, or is too expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize);

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    // Drop all cached per-VF analysis results.
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }
1624 
private:
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factor, a power-of-2 larger
  /// than zero. One is returned if vectorization should best be avoided due
  /// to cost.
  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
                                    ElementCount UserVF);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  VectorizationCostTy expectedCost(ElementCount VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. Non-uniform scalarized instructions will be
  /// represented by VF values in the vectorized loop, each corresponding to an
  /// iteration of the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  /// The per-(instruction, VF) widening decisions and their costs.
  DecisionList WideningDecisions;
1773 
1774   /// Returns true if \p V is expected to be vectorized and it needs to be
1775   /// extracted.
1776   bool needsExtract(Value *V, ElementCount VF) const {
1777     Instruction *I = dyn_cast<Instruction>(V);
1778     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1779         TheLoop->isLoopInvariant(I))
1780       return false;
1781 
1782     // Assume we can vectorize V (and hence we need extraction) if the
1783     // scalars are not computed yet. This can happen, because it is called
1784     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1785     // the scalars are collected. That should be a safe assumption in most
1786     // cases, because we check if the operands have vectorizable types
1787     // beforehand in LoopVectorizationLegality.
1788     return Scalars.find(VF) == Scalars.end() ||
1789            !isScalarAfterVectorization(I, VF);
1790   };
1791 
1792   /// Returns a range containing only operands needing to be extracted.
1793   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1794                                                    ElementCount VF) {
1795     return SmallVector<Value *, 4>(make_filter_range(
1796         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1797   }
1798 
  /// Determines if we have the infrastructure to vectorize loop \p L and its
  /// epilogue, assuming the main loop is vectorized by \p VF.
  bool isCandidateForEpilogueVectorization(const Loop &L,
                                           const ElementCount VF) const;

  /// Returns true if epilogue vectorization is considered profitable, and
  /// false otherwise.
  /// \p VF is the vectorization factor chosen for the original loop.
  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;

public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The function containing the loop we evaluate.
  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// Profitable vector factors.
  SmallVector<VectorizationFactor, 8> ProfitableVFs;
1854 };
1855 
1856 } // end namespace llvm
1857 
1858 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1859 // vectorization. The loop needs to be annotated with #pragma omp simd
1860 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1861 // vector length information is not provided, vectorization is not considered
1862 // explicit. Interleave hints are not allowed either. These limitations will be
1863 // relaxed in the future.
1864 // Please, note that we are currently forced to abuse the pragma 'clang
1865 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1866 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1867 // provides *explicit vectorization hints* (LV can bypass legal checks and
1868 // assume that vectorization is legal). However, both hints are implemented
1869 // using the same metadata (llvm.loop.vectorize, processed by
1870 // LoopVectorizeHints). This will be fixed in the future when the native IR
1871 // representation for pragma 'omp simd' is introduced.
1872 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1873                                    OptimizationRemarkEmitter *ORE) {
1874   assert(!OuterLp->isInnermost() && "This is not an outer loop");
1875   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1876 
1877   // Only outer loops with an explicit vectorization hint are supported.
1878   // Unannotated outer loops are ignored.
1879   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1880     return false;
1881 
1882   Function *Fn = OuterLp->getHeader()->getParent();
1883   if (!Hints.allowVectorization(Fn, OuterLp,
1884                                 true /*VectorizeOnlyWhenForced*/)) {
1885     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1886     return false;
1887   }
1888 
1889   if (Hints.getInterleave() > 1) {
1890     // TODO: Interleave support is future work.
1891     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1892                          "outer loops.\n");
1893     Hints.emitRemarkWithHints();
1894     return false;
1895   }
1896 
1897   return true;
1898 }
1899 
1900 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1901                                   OptimizationRemarkEmitter *ORE,
1902                                   SmallVectorImpl<Loop *> &V) {
1903   // Collect inner loops and outer loops without irreducible control flow. For
1904   // now, only collect outer loops that have explicit vectorization hints. If we
1905   // are stress testing the VPlan H-CFG construction, we collect the outermost
1906   // loop of every loop nest.
1907   if (L.isInnermost() || VPlanBuildStressTest ||
1908       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1909     LoopBlocksRPO RPOT(&L);
1910     RPOT.perform(LI);
1911     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1912       V.push_back(&L);
1913       // TODO: Collect inner loops inside marked outer loops in case
1914       // vectorization fails for the outer loop. Do not invoke
1915       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1916       // already known to be reducible. We can use an inherited attribute for
1917       // that.
1918       return;
1919     }
1920   }
1921   for (Loop *InnerL : L)
1922     collectSupportedLoops(*InnerL, LI, ORE, V);
1923 }
1924 
1925 namespace {
1926 
/// The LoopVectorize Pass: a legacy-pass-manager adapter that gathers the
/// required analyses and delegates to the shared LoopVectorizePass
/// implementation.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  /// The implementation this pass delegates to.
  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID),
        Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    // Fetch all analyses required by the implementation.
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    // Target library info is optional; Impl accepts a null TLI.
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    // Provide LoopAccessInfo lazily, computed per loop on request.
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI).MadeAnyChange;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addRequired<InjectTLIMappingsLegacy>();

    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
};
1993 
1994 } // end anonymous namespace
1995 
1996 //===----------------------------------------------------------------------===//
1997 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1998 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1999 //===----------------------------------------------------------------------===//
2000 
2001 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2002   // We need to place the broadcast of invariant variables outside the loop,
2003   // but only if it's proven safe to do so. Else, broadcast will be inside
2004   // vector loop body.
2005   Instruction *Instr = dyn_cast<Instruction>(V);
2006   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2007                      (!Instr ||
2008                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2009   // Place the code for broadcasting invariant variables in the new preheader.
2010   IRBuilder<>::InsertPointGuard Guard(Builder);
2011   if (SafeToHoist)
2012     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2013 
2014   // Broadcast the scalar into all locations in the vector.
2015   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2016 
2017   return Shuf;
2018 }
2019 
// Widen an integer or floating-point induction into a vector phi node.
// \p II describes the induction, \p Step and \p Start are its loop-invariant
// step and start values, and \p EntryVal is the original-loop value being
// widened: either the induction phi itself or a truncate of it. One stepped
// vector value is recorded per unroll part, and the vector phi is wired up
// between the preheader and the vector-loop latch.
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Value *Start,
    Instruction *EntryVal) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    // When widening a truncated IV, narrow both the start and the step to the
    // truncate's destination type first.
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }
  // SteppedStart is <Start, Start + Step, Start + 2*Step, ...>.
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  Value *SteppedStart =
      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Value *ConstVF =
      getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    // Record this part's vector value, then advance by VF * Step for the
    // next unroll part.
    VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);

    LastInduction = cast<Instruction>(addFastMathFlag(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}
2098 
2099 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2100   return Cost->isScalarAfterVectorization(I, VF) ||
2101          Cost->isProfitableToScalarize(I, VF);
2102 }
2103 
2104 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2105   if (shouldScalarizeInstruction(IV))
2106     return true;
2107   auto isScalarInst = [&](User *U) -> bool {
2108     auto *I = cast<Instruction>(U);
2109     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2110   };
2111   return llvm::any_of(IV->users(), isScalarInst);
2112 }
2113 
2114 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2115     const InductionDescriptor &ID, const Instruction *EntryVal,
2116     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
2117   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2118          "Expected either an induction phi-node or a truncate of it!");
2119 
2120   // This induction variable is not the phi from the original loop but the
2121   // newly-created IV based on the proof that casted Phi is equal to the
2122   // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
2123   // re-uses the same InductionDescriptor that original IV uses but we don't
2124   // have to do any recording in this case - that is done when original IV is
2125   // processed.
2126   if (isa<TruncInst>(EntryVal))
2127     return;
2128 
2129   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2130   if (Casts.empty())
2131     return;
2132   // Only the first Cast instruction in the Casts vector is of interest.
2133   // The rest of the Casts (if exist) have no uses outside the
2134   // induction update chain itself.
2135   Instruction *CastInst = *Casts.begin();
2136   if (Lane < UINT_MAX)
2137     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
2138   else
2139     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
2140 }
2141 
// Widen the integer/fp induction \p IV starting at \p Start, optionally as
// the truncated value \p Trunc. Depending on the cost model, this emits a
// vector IV phi, per-lane scalar steps, a splat of a scalar IV, or a
// combination of those.
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
                                                TruncInst *Trunc) {
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");

  auto II = Legal->getInductionVars().find(IV);
  assert(II != Legal->getInductionVars().end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant
  auto CreateStepValue = [&](const SCEV *Step) -> Value * {
    assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
           "Induction step should be loop invariant");
    if (PSE.getSE()->isSCEVable(IV->getType())) {
      // Expand the step expression in the preheader so it is available to
      // all parts of the vector loop.
      SCEVExpander Exp(*PSE.getSE(), DL, "induction");
      return Exp.expandCodeFor(Step, Step->getType(),
                               LoopVectorPreHeader->getTerminator());
    }
    return cast<SCEVUnknown>(Step)->getValue();
  };

  // The scalar value to broadcast. This is derived from the canonical
  // induction variable. If a truncation type is given, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  auto CreateScalarIV = [&](Value *&Step) -> Value * {
    Value *ScalarIV = Induction;
    if (IV != OldInduction) {
      // Derive this IV from the canonical one via the induction descriptor's
      // transform (start + canonical * step, or the FP equivalent).
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      // Note: Step is narrowed here as a side effect (by-reference param).
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      Step = Builder.CreateTrunc(Step, TruncType);
    }
    return ScalarIV;
  };

  // Create the vector values from the scalar IV, in the absence of creating a
  // vector IV.
  auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      // Each unroll part starts VF * Part steps further along.
      Value *EntryPart =
          getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
                        ID.getInductionOpcode());
      VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
    }
  };

  // Now do the actual transformations, and start with creating the step value.
  Value *Step = CreateStepValue(ID.getStep());
  if (VF.isZero() || VF.isScalar()) {
    // Not vectorizing: a splat of the scalar IV is all that is needed.
    Value *ScalarIV = CreateScalarIV(Step);
    CreateSplatIV(ScalarIV, Step);
    return;
  }

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = needsScalarInduction(EntryVal);
  if (!NeedsScalarIV) {
    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal);
    return;
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (!shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal);
    Value *ScalarIV = CreateScalarIV(Step);
    // Create scalar steps that can be used by instructions we will later
    // scalarize. Note that the addition of the scalar steps will not increase
    // the number of instructions in the loop in the common case prior to
    // InstCombine. We will be trading one vector extract for each scalar step.
    buildScalarSteps(ScalarIV, Step, EntryVal, ID);
    return;
  }

  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorised IV. Except when we tail-fold, then the splat IV feeds the
  // predicate used by the masked loads/stores.
  Value *ScalarIV = CreateScalarIV(Step);
  if (!Cost->isScalarEpilogueAllowed())
    CreateSplatIV(ScalarIV, Step);
  buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}
2251 
2252 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2253                                           Instruction::BinaryOps BinOp) {
2254   // Create and check the types.
2255   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2256   int VLen = ValVTy->getNumElements();
2257 
2258   Type *STy = Val->getType()->getScalarType();
2259   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2260          "Induction Step must be an integer or FP");
2261   assert(Step->getType() == STy && "Step has wrong type");
2262 
2263   SmallVector<Constant *, 8> Indices;
2264 
2265   if (STy->isIntegerTy()) {
2266     // Create a vector of consecutive numbers from zero to VF.
2267     for (int i = 0; i < VLen; ++i)
2268       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2269 
2270     // Add the consecutive indices to the vector value.
2271     Constant *Cv = ConstantVector::get(Indices);
2272     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2273     Step = Builder.CreateVectorSplat(VLen, Step);
2274     assert(Step->getType() == Val->getType() && "Invalid step vec");
2275     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2276     // which can be found from the original scalar operations.
2277     Step = Builder.CreateMul(Cv, Step);
2278     return Builder.CreateAdd(Val, Step, "induction");
2279   }
2280 
2281   // Floating point induction.
2282   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2283          "Binary Opcode should be specified for FP induction");
2284   // Create a vector of consecutive numbers from zero to VF.
2285   for (int i = 0; i < VLen; ++i)
2286     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2287 
2288   // Add the consecutive indices to the vector value.
2289   Constant *Cv = ConstantVector::get(Indices);
2290 
2291   Step = Builder.CreateVectorSplat(VLen, Step);
2292 
2293   // Floating point operations had to be 'fast' to enable the induction.
2294   FastMathFlags Flags;
2295   Flags.setFast();
2296 
2297   Value *MulOp = Builder.CreateFMul(Cv, Step);
2298   if (isa<Instruction>(MulOp))
2299     // Have to check, MulOp may be a constant
2300     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2301 
2302   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2303   if (isa<Instruction>(BOp))
2304     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2305   return BOp;
2306 }
2307 
// Emit per-lane scalar step values ScalarIV + (VF * Part + Lane) * Step for
// each unroll part, recording them in VectorLoopValueMap for \p EntryVal so
// later-scalarized users can pick them up.
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                           Instruction *EntryVal,
                                           const InductionDescriptor &ID) {
  // We shouldn't have to build scalar steps if we aren't vectorizing.
  assert(VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same integer type.
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
  assert(ScalarIVTy == Step->getType() &&
         "Val and Step should have the same type");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (ScalarIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration. If EntryVal is uniform, we only need to generate the first
  // lane. Otherwise, we generate all VF values.
  unsigned Lanes =
      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
          ? 1
          : VF.getKnownMinValue();
  assert((!VF.isScalable() || Lanes == 1) &&
         "Should never scalarize a scalable vector");
  // Compute the scalar steps and save the results in VectorLoopValueMap.
  for (unsigned Part = 0; Part < UF; ++Part) {
    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
      // StartIdx = VF * Part + Lane, computed in an integer type of the same
      // width as the IV (then converted to FP for FP inductions).
      auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
                                         ScalarIVTy->getScalarSizeInBits());
      Value *StartIdx =
          createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
      if (ScalarIVTy->isFloatingPointTy())
        StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
      StartIdx = addFastMathFlag(Builder.CreateBinOp(
          AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
      auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
      VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
    }
  }
}
2362 
// Return the vector value corresponding to original-loop value \p V for
// unroll part \p Part, creating it on demand: from the cached vector map,
// by packing already-scalarized lane values, or by broadcasting a
// loop-invariant scalar.
Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
  assert(V != Induction && "The new induction variable should not be used.");
  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
  assert(!V->getType()->isVoidTy() && "Type does not produce a value");

  // If we have a stride that is replaced by one, do it here. Defer this for
  // the VPlan-native path until we start running Legal checks in that path.
  if (!EnableVPlanNativePath && Legal->hasStride(V))
    V = ConstantInt::get(V->getType(), 1);

  // If we have a vector mapped to this value, return it.
  if (VectorLoopValueMap.hasVectorValue(V, Part))
    return VectorLoopValueMap.getVectorValue(V, Part);

  // If the value has not been vectorized, check if it has been scalarized
  // instead. If it has been scalarized, and we actually need the value in
  // vector form, we will construct the vector values on demand.
  if (VectorLoopValueMap.hasAnyScalarValue(V)) {
    Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});

    // If we've scalarized a value, that value should be an instruction.
    auto *I = cast<Instruction>(V);

    // If we aren't vectorizing, we can just copy the scalar map values over to
    // the vector map.
    if (VF.isScalar()) {
      VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
      return ScalarValue;
    }

    // Get the last scalar instruction we generated for V and Part. If the value
    // is known to be uniform after vectorization, this corresponds to lane zero
    // of the Part unroll iteration. Otherwise, the last instruction is the one
    // we created for the last vector lane of the Part unroll iteration.
    unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
                            ? 0
                            : VF.getKnownMinValue() - 1;
    assert((!VF.isScalable() || LastLane == 0) &&
           "Scalable vectorization can't lead to any scalarized values.");
    auto *LastInst = cast<Instruction>(
        VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));

    // Set the insert point after the last scalarized instruction. This ensures
    // the insertelement sequence will directly follow the scalar definitions.
    auto OldIP = Builder.saveIP();
    auto NewIP = std::next(BasicBlock::iterator(LastInst));
    Builder.SetInsertPoint(&*NewIP);

    // However, if we are vectorizing, we need to construct the vector values.
    // If the value is known to be uniform after vectorization, we can just
    // broadcast the scalar value corresponding to lane zero for each unroll
    // iteration. Otherwise, we construct the vector values using insertelement
    // instructions. Since the resulting vectors are stored in
    // VectorLoopValueMap, we will only generate the insertelements once.
    Value *VectorValue = nullptr;
    if (Cost->isUniformAfterVectorization(I, VF)) {
      VectorValue = getBroadcastInstrs(ScalarValue);
      VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
    } else {
      // Initialize packing with insertelements to start from poison.
      assert(!VF.isScalable() && "VF is assumed to be non scalable.");
      Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF));
      VectorLoopValueMap.setVectorValue(V, Part, Poison);
      for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
        packScalarIntoVectorValue(V, {Part, Lane});
      VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
    }
    // Restore the insert point for the caller.
    Builder.restoreIP(OldIP);
    return VectorValue;
  }

  // If this scalar is unknown, assume that it is a constant or that it is
  // loop invariant. Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
  VectorLoopValueMap.setVectorValue(V, Part, B);
  return B;
}
2440 
2441 Value *
2442 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2443                                             const VPIteration &Instance) {
2444   // If the value is not an instruction contained in the loop, it should
2445   // already be scalar.
2446   if (OrigLoop->isLoopInvariant(V))
2447     return V;
2448 
2449   assert(Instance.Lane > 0
2450              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2451              : true && "Uniform values only have lane zero");
2452 
2453   // If the value from the original loop has not been vectorized, it is
2454   // represented by UF x VF scalar values in the new loop. Return the requested
2455   // scalar value.
2456   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2457     return VectorLoopValueMap.getScalarValue(V, Instance);
2458 
2459   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2460   // for the given unroll part. If this entry is not a vector type (i.e., the
2461   // vectorization factor is one), there is no need to generate an
2462   // extractelement instruction.
2463   auto *U = getOrCreateVectorValue(V, Instance.Part);
2464   if (!U->getType()->isVectorTy()) {
2465     assert(VF.isScalar() && "Value not scalarized has non-vector type");
2466     return U;
2467   }
2468 
2469   // Otherwise, the value from the original loop has been vectorized and is
2470   // represented by UF vector values. Extract and return the requested scalar
2471   // value from the appropriate vector lane.
2472   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2473 }
2474 
2475 void InnerLoopVectorizer::packScalarIntoVectorValue(
2476     Value *V, const VPIteration &Instance) {
2477   assert(V != Induction && "The new induction variable should not be used.");
2478   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2479   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2480 
2481   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2482   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2483   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2484                                             Builder.getInt32(Instance.Lane));
2485   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2486 }
2487 
2488 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2489   assert(Vec->getType()->isVectorTy() && "Invalid type");
2490   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
2491   SmallVector<int, 8> ShuffleMask;
2492   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2493     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2494 
2495   return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2496 }
2497 
2498 // Return whether we allow using masked interleave-groups (for dealing with
2499 // strided loads/stores that reside in predicated blocks, or for dealing
2500 // with gaps).
2501 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2502   // If an override option has been passed in for interleaved accesses, use it.
2503   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2504     return EnableMaskedInterleavedMemAccesses;
2505 
2506   return TTI.enableMaskedInterleavedAccessVectorization();
2507 }
2508 
2509 // Try to vectorize the interleave group that \p Instr belongs to.
2510 //
2511 // E.g. Translate following interleaved load group (factor = 3):
2512 //   for (i = 0; i < N; i+=3) {
2513 //     R = Pic[i];             // Member of index 0
2514 //     G = Pic[i+1];           // Member of index 1
2515 //     B = Pic[i+2];           // Member of index 2
2516 //     ... // do something to R, G, B
2517 //   }
2518 // To:
2519 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2520 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2521 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2522 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2523 //
2524 // Or translate following interleaved store group (factor = 3):
2525 //   for (i = 0; i < N; i+=3) {
2526 //     ... do something to R, G, B
2527 //     Pic[i]   = R;           // Member of index 0
2528 //     Pic[i+1] = G;           // Member of index 1
2529 //     Pic[i+2] = B;           // Member of index 2
2530 //   }
2531 // To:
2532 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2533 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2534 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2535 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2536 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2537 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2538     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2539     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2540     VPValue *BlockInMask) {
2541   Instruction *Instr = Group->getInsertPos();
2542   const DataLayout &DL = Instr->getModule()->getDataLayout();
2543 
2544   // Prepare for the vector type of the interleaved load/store.
2545   Type *ScalarTy = getMemInstValueType(Instr);
2546   unsigned InterleaveFactor = Group->getFactor();
2547   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2548   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2549 
2550   // Prepare for the new pointers.
2551   SmallVector<Value *, 2> AddrParts;
2552   unsigned Index = Group->getIndex(Instr);
2553 
2554   // TODO: extend the masked interleaved-group support to reversed access.
2555   assert((!BlockInMask || !Group->isReverse()) &&
2556          "Reversed masked interleave-group not supported.");
2557 
2558   // If the group is reverse, adjust the index to refer to the last vector lane
2559   // instead of the first. We adjust the index from the first vector lane,
2560   // rather than directly getting the pointer for lane VF - 1, because the
2561   // pointer operand of the interleaved access is supposed to be uniform. For
2562   // uniform instructions, we're only required to generate a value for the
2563   // first vector lane in each unroll iteration.
2564   assert(!VF.isScalable() &&
2565          "scalable vector reverse operation is not implemented");
2566   if (Group->isReverse())
2567     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2568 
2569   for (unsigned Part = 0; Part < UF; Part++) {
2570     Value *AddrPart = State.get(Addr, {Part, 0});
2571     setDebugLocFromInst(Builder, AddrPart);
2572 
2573     // Notice current instruction could be any index. Need to adjust the address
2574     // to the member of index 0.
2575     //
2576     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2577     //       b = A[i];       // Member of index 0
2578     // Current pointer is pointed to A[i+1], adjust it to A[i].
2579     //
2580     // E.g.  A[i+1] = a;     // Member of index 1
2581     //       A[i]   = b;     // Member of index 0
2582     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2583     // Current pointer is pointed to A[i+2], adjust it to A[i].
2584 
2585     bool InBounds = false;
2586     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2587       InBounds = gep->isInBounds();
2588     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2589     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2590 
2591     // Cast to the vector pointer type.
2592     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2593     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2594     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2595   }
2596 
2597   setDebugLocFromInst(Builder, Instr);
2598   Value *PoisonVec = PoisonValue::get(VecTy);
2599 
2600   Value *MaskForGaps = nullptr;
2601   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2602     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2603     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2604     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2605   }
2606 
2607   // Vectorize the interleaved load group.
2608   if (isa<LoadInst>(Instr)) {
2609     // For each unroll part, create a wide load for the group.
2610     SmallVector<Value *, 2> NewLoads;
2611     for (unsigned Part = 0; Part < UF; Part++) {
2612       Instruction *NewLoad;
2613       if (BlockInMask || MaskForGaps) {
2614         assert(useMaskedInterleavedAccesses(*TTI) &&
2615                "masked interleaved groups are not allowed.");
2616         Value *GroupMask = MaskForGaps;
2617         if (BlockInMask) {
2618           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2619           assert(!VF.isScalable() && "scalable vectors not yet supported.");
2620           Value *ShuffledMask = Builder.CreateShuffleVector(
2621               BlockInMaskPart,
2622               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2623               "interleaved.mask");
2624           GroupMask = MaskForGaps
2625                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2626                                                 MaskForGaps)
2627                           : ShuffledMask;
2628         }
2629         NewLoad =
2630             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2631                                      GroupMask, PoisonVec, "wide.masked.vec");
2632       }
2633       else
2634         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2635                                             Group->getAlign(), "wide.vec");
2636       Group->addMetadata(NewLoad);
2637       NewLoads.push_back(NewLoad);
2638     }
2639 
2640     // For each member in the group, shuffle out the appropriate data from the
2641     // wide loads.
2642     unsigned J = 0;
2643     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2644       Instruction *Member = Group->getMember(I);
2645 
2646       // Skip the gaps in the group.
2647       if (!Member)
2648         continue;
2649 
2650       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2651       auto StrideMask =
2652           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2653       for (unsigned Part = 0; Part < UF; Part++) {
2654         Value *StridedVec = Builder.CreateShuffleVector(
2655             NewLoads[Part], StrideMask, "strided.vec");
2656 
2657         // If this member has different type, cast the result type.
2658         if (Member->getType() != ScalarTy) {
2659           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2660           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2661           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2662         }
2663 
2664         if (Group->isReverse())
2665           StridedVec = reverseVector(StridedVec);
2666 
2667         State.set(VPDefs[J], Member, StridedVec, Part);
2668       }
2669       ++J;
2670     }
2671     return;
2672   }
2673 
2674   // The sub vector type for current instruction.
2675   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2676   auto *SubVT = VectorType::get(ScalarTy, VF);
2677 
2678   // Vectorize the interleaved store group.
2679   for (unsigned Part = 0; Part < UF; Part++) {
2680     // Collect the stored vector from each member.
2681     SmallVector<Value *, 4> StoredVecs;
2682     for (unsigned i = 0; i < InterleaveFactor; i++) {
2683       // Interleaved store group doesn't allow a gap, so each index has a member
2684       assert(Group->getMember(i) && "Fail to get a member from an interleaved store group");
2685 
2686       Value *StoredVec = State.get(StoredValues[i], Part);
2687 
2688       if (Group->isReverse())
2689         StoredVec = reverseVector(StoredVec);
2690 
2691       // If this member has different type, cast it to a unified type.
2692 
2693       if (StoredVec->getType() != SubVT)
2694         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2695 
2696       StoredVecs.push_back(StoredVec);
2697     }
2698 
2699     // Concatenate all vectors into a wide vector.
2700     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2701 
2702     // Interleave the elements in the wide vector.
2703     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2704     Value *IVec = Builder.CreateShuffleVector(
2705         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2706         "interleaved.vec");
2707 
2708     Instruction *NewStoreInstr;
2709     if (BlockInMask) {
2710       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2711       Value *ShuffledMask = Builder.CreateShuffleVector(
2712           BlockInMaskPart,
2713           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2714           "interleaved.mask");
2715       NewStoreInstr = Builder.CreateMaskedStore(
2716           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2717     }
2718     else
2719       NewStoreInstr =
2720           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2721 
2722     Group->addMetadata(NewStoreInstr);
2723   }
2724 }
2725 
// Widen a single scalar load or store into one vector memory operation per
// unroll part. Depending on the cost model's widening decision this emits
// either a consecutive wide load/store (optionally reversed and/or masked)
// or a gather/scatter using a vector of pointers.
void InnerLoopVectorizer::vectorizeMemoryInstruction(
    Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
    VPValue *StoredValue, VPValue *BlockInMask) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");
  assert((!SI || StoredValue) && "No stored value provided for widened store");
  assert((!LI || !StoredValue) && "Stored value provided for widened load");

  // The cost model has already decided how this access is to be widened;
  // anything other than widen/widen-reverse/gather-scatter would have been
  // routed to scalarization instead.
  LoopVectorizationCostModel::InstWidening Decision =
      Cost->getWideningDecision(Instr, VF);
  assert((Decision == LoopVectorizationCostModel::CM_Widen ||
          Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
          Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
         "CM decision is not to widen the memory instruction");

  Type *ScalarDataTy = getMemInstValueType(Instr);

  auto *DataTy = VectorType::get(ScalarDataTy, VF);
  const Align Alignment = getLoadStoreAlignment(Instr);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
  bool ConsecutiveStride =
      Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
  bool CreateGatherScatter =
      (Decision == LoopVectorizationCostModel::CM_GatherScatter);

  // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
  // gather/scatter. Otherwise Decision should have been to Scalarize.
  assert((ConsecutiveStride || CreateGatherScatter) &&
         "The instruction should be scalarized");
  (void)ConsecutiveStride;

  // Materialize the block-in mask for each unroll part up front; the reverse
  // case below may overwrite an entry with its reversed form.
  VectorParts BlockInMaskParts(UF);
  bool isMaskRequired = BlockInMask;
  if (isMaskRequired)
    for (unsigned Part = 0; Part < UF; ++Part)
      BlockInMaskParts[Part] = State.get(BlockInMask, Part);

  // Compute the address for one unroll part of a consecutive access, as a
  // pointer bitcast to the widened vector type. NOTE: mutates
  // BlockInMaskParts[Part] (reverses it) in the Reverse case.
  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    // Propagate inbounds from the original (possibly casted) GEP.
    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
      InBounds = gep->isInBounds();

    if (Reverse) {
      assert(!VF.isScalable() &&
             "Reversing vectors is not yet supported for scalable vectors.");

      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      // Step back Part whole vectors, then back (VF-1) more elements so the
      // wide access covers elements [-Part*VF - (VF-1), -Part*VF].
      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
          ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
          ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
    } else {
      // Forward case: advance by Part * VF elements.
      Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
      PartPtr->setIsInBounds(InBounds);
    }

    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };

  // Handle Stores:
  if (SI) {
    setDebugLocFromInst(Builder, SI);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = State.get(StoredValue, Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
        Value *VectorGep = State.get(Addr, Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = reverseVector(StoredVal);
          // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
        }
        // {0, 0} selects the scalar address for part 0, lane 0.
        auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            BlockInMaskParts[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(Builder, LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
      Value *VectorGep = State.get(Addr, Part);
      NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      addMetadata(NewLI, LI);
    } else {
      auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(
            VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
            "wide.masked.load");
      else
        NewLI =
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = reverseVector(NewLI);
    }

    State.set(Def, Instr, NewLI, Part);
  }
}
2864 
2865 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2866                                                const VPIteration &Instance,
2867                                                bool IfPredicateInstr,
2868                                                VPTransformState &State) {
2869   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2870 
2871   setDebugLocFromInst(Builder, Instr);
2872 
2873   // Does this instruction return a value ?
2874   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2875 
2876   Instruction *Cloned = Instr->clone();
2877   if (!IsVoidRetTy)
2878     Cloned->setName(Instr->getName() + ".cloned");
2879 
2880   // Replace the operands of the cloned instructions with their scalar
2881   // equivalents in the new loop.
2882   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2883     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2884     auto InputInstance = Instance;
2885     if (!Operand || !OrigLoop->contains(Operand) ||
2886         (Cost->isUniformAfterVectorization(Operand, State.VF)))
2887       InputInstance.Lane = 0;
2888     auto *NewOp = State.get(User.getOperand(op), InputInstance);
2889     Cloned->setOperand(op, NewOp);
2890   }
2891   addNewMetadata(Cloned, Instr);
2892 
2893   // Place the cloned scalar in the new loop.
2894   Builder.Insert(Cloned);
2895 
2896   // TODO: Set result for VPValue of VPReciplicateRecipe. This requires
2897   // representing scalar values in VPTransformState. Add the cloned scalar to
2898   // the scalar map entry.
2899   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2900 
2901   // If we just cloned a new assumption, add it the assumption cache.
2902   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2903     if (II->getIntrinsicID() == Intrinsic::assume)
2904       AC->registerAssumption(II);
2905 
2906   // End if-block.
2907   if (IfPredicateInstr)
2908     PredicatedInstructions.push_back(Cloned);
2909 }
2910 
2911 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2912                                                       Value *End, Value *Step,
2913                                                       Instruction *DL) {
2914   BasicBlock *Header = L->getHeader();
2915   BasicBlock *Latch = L->getLoopLatch();
2916   // As we're just creating this loop, it's possible no latch exists
2917   // yet. If so, use the header as this will be a single block loop.
2918   if (!Latch)
2919     Latch = Header;
2920 
2921   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2922   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2923   setDebugLocFromInst(Builder, OldInst);
2924   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2925 
2926   Builder.SetInsertPoint(Latch->getTerminator());
2927   setDebugLocFromInst(Builder, OldInst);
2928 
2929   // Create i+1 and fill the PHINode.
2930   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2931   Induction->addIncoming(Start, L->getLoopPreheader());
2932   Induction->addIncoming(Next, Latch);
2933   // Create the compare.
2934   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2935   Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
2936 
2937   // Now we have two terminators. Remove the old one from the block.
2938   Latch->getTerminator()->eraseFromParent();
2939 
2940   return Induction;
2941 }
2942 
// Return the trip count of \p L (backedge-taken count + 1), expanding it into
// the loop preheader on first use and caching the result in TripCount.
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
         "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  // Widen (zero-extend) to the induction type if the count is narrower.
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, DL, "induction");

  // Count holds the overall loop count (N).
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                L->getLoopPreheader()->getTerminator());

  // A pointer-typed count (e.g. from a pointer induction) must be converted
  // to an integer before it can be used in arithmetic below.
  if (TripCount->getType()->isPointerTy())
    TripCount =
        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
                                    L->getLoopPreheader()->getTerminator());

  return TripCount;
}
2989 
// Return the number of iterations executed by the vector loop — the trip
// count rounded down (or, with tail folding, up) to a multiple of VF * UF —
// caching the result in VectorTripCount.
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getOrCreateTripCount(L);
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

  Type *Ty = TC->getType();
  // This is where we can make the step a runtime constant.
  // Step = VF * UF (as a value of the trip-count type).
  Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");
    assert(!VF.isScalable() &&
           "Tail folding not yet supported for scalable vectors");
    TC = Builder.CreateAdd(
        TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  // There are two cases where we need to ensure (at least) the last iteration
  // runs in the scalar remainder loop. Thus, if the step evenly divides
  // the trip count, we set the remainder to be equal to the step. If the step
  // does not evenly divide the trip count, no adjustment is necessary since
  // there will already be scalar iterations. Note that the minimum iterations
  // check ensures that N >= Step. The cases are:
  // 1) If there is a non-reversed interleaved group that may speculatively
  //    access memory out-of-bounds.
  // 2) If any instruction may follow a conditionally taken exit. That is, if
  //    the loop contains multiple exiting blocks, or a single exiting block
  //    which is not the latch.
  if (VF.isVector() && Cost->requiresScalarEpilogue()) {
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
    R = Builder.CreateSelect(IsZero, Step, R);
  }

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}
3043 
3044 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3045                                                    const DataLayout &DL) {
3046   // Verify that V is a vector type with same number of elements as DstVTy.
3047   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3048   unsigned VF = DstFVTy->getNumElements();
3049   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3050   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3051   Type *SrcElemTy = SrcVecTy->getElementType();
3052   Type *DstElemTy = DstFVTy->getElementType();
3053   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3054          "Vector elements must have same size");
3055 
3056   // Do a direct cast if element types are castable.
3057   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3058     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3059   }
3060   // V cannot be directly casted to desired vector type.
3061   // May happen when V is a floating point vector but DstVTy is a vector of
3062   // pointers or vice-versa. Handle this using a two-step bitcast using an
3063   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3064   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3065          "Only one type should be a pointer type");
3066   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3067          "Only one type should be a floating point type");
3068   Type *IntTy =
3069       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3070   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3071   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3072   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3073 }
3074 
// Emit the runtime check that branches to the scalar loop (\p Bypass) when
// the trip count is too small for even one vector iteration, splitting a new
// vector preheader off the existing one.
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                         BasicBlock *Bypass) {
  Value *Count = getOrCreateTripCount(L);
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
                                          : ICmpInst::ICMP_ULT;

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *CheckMinIters = Builder.getFalse();
  if (!Cost->foldTailByMasking()) {
    // Step = VF * UF in the trip-count type.
    Value *Step =
        createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
    CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
  }
  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
                 "vector.ph");

  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                               DT->getNode(Bypass)->getIDom()) &&
         "TC check is expected to dominate Bypass");

  // Update dominator for Bypass & LoopExit.
  DT->changeImmediateDominator(Bypass, TCCheckBlock);
  DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

  // Replace the unconditional terminator with the conditional bypass branch.
  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
  LoopBypassBlocks.push_back(TCCheckBlock);
}
3116 
// Emit the runtime checks for the SCEV predicate assumptions (e.g. stride and
// overflow checks); if any check can fail at runtime, branch to the scalar
// loop (\p Bypass). Does nothing when the predicate folds to "always true".
void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  // Reuse existing vector loop preheader for SCEV checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;

  // Generate the code to check that the SCEV assumptions that we made.
  // We want the new basic block to start at the first instruction in a
  // sequence of instructions that form a check.
  SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
                   "scev.check");
  Value *SCEVCheck = Exp.expandCodeForPredicate(
      &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());

  // A constant-false check can never fail; no block split is needed.
  if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
    if (C->isZero())
      return;

  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
           (OptForSizeBasedOnProfile &&
            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
         "Cannot SCEV check stride or overflow when optimizing for size");

  SCEVCheckBlock->setName("vector.scevcheck");
  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
                 nullptr, "vector.ph");

  // Update dominator only if this is first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
  }

  // Branch to the scalar loop when the expanded predicate check fires.
  ReplaceInstWithInst(
      SCEVCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
  LoopBypassBlocks.push_back(SCEVCheckBlock);
  AddedSafetyChecks = true;
}
3157 
3158 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3159   // VPlan-native path does not do any analysis for runtime checks currently.
3160   if (EnableVPlanNativePath)
3161     return;
3162 
3163   // Reuse existing vector loop preheader for runtime memory checks.
3164   // Note that new preheader block is generated for vector loop.
3165   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
3166 
3167   // Generate the code that checks in runtime if arrays overlap. We put the
3168   // checks into a separate block to make the more common case of few elements
3169   // faster.
3170   auto *LAI = Legal->getLAI();
3171   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
3172   if (!RtPtrChecking.Need)
3173     return;
3174 
3175   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3176     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3177            "Cannot emit memory checks when optimizing for size, unless forced "
3178            "to vectorize.");
3179     ORE->emit([&]() {
3180       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3181                                         L->getStartLoc(), L->getHeader())
3182              << "Code-size may be reduced by not forcing "
3183                 "vectorization, or by source-code modifications "
3184                 "eliminating the need for runtime checks "
3185                 "(e.g., adding 'restrict').";
3186     });
3187   }
3188 
3189   MemCheckBlock->setName("vector.memcheck");
3190   // Create new preheader for vector loop.
3191   LoopVectorPreHeader =
3192       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
3193                  "vector.ph");
3194 
3195   auto *CondBranch = cast<BranchInst>(
3196       Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
3197   ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
3198   LoopBypassBlocks.push_back(MemCheckBlock);
3199   AddedSafetyChecks = true;
3200 
3201   // Update dominator only if this is first RT check.
3202   if (LoopBypassBlocks.empty()) {
3203     DT->changeImmediateDominator(Bypass, MemCheckBlock);
3204     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
3205   }
3206 
3207   Instruction *FirstCheckInst;
3208   Instruction *MemRuntimeCheck;
3209   std::tie(FirstCheckInst, MemRuntimeCheck) =
3210       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
3211                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
3212   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
3213                             "claimed checks are required");
3214   CondBranch->setCondition(MemRuntimeCheck);
3215 
3216   // We currently don't use LoopVersioning for the actual loop cloning but we
3217   // still use it to add the noalias metadata.
3218   LVer = std::make_unique<LoopVersioning>(
3219       *Legal->getLAI(),
3220       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3221       DT, PSE.getSE());
3222   LVer->prepareNoAliasMetadata();
3223 }
3224 
// Compute the value of the induction described by \p ID at iteration
// \p Index: StartValue + Index * Step for integer inductions, the GEP
// analogue for pointer inductions, and StartValue fadd/fsub Index * Step for
// FP inductions. Returns nullptr for IK_NoInduction.
Value *InnerLoopVectorizer::emitTransformedIndex(
    IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
    const InductionDescriptor &ID) const {

  SCEVExpander Exp(*SE, DL, "induction");
  auto Step = ID.getStep();
  auto StartValue = ID.getStartValue();
  assert(Index->getType() == Step->getType() &&
         "Index type does not match StepValue type");

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  // Fold away X + 0 / 0 + Y to avoid emitting dead adds.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  // Fold away X * 1 / 1 * Y to avoid emitting dead multiplies.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    return B.CreateMul(X, Y);
  };

  // Get a suitable insert point for SCEV expansion. For blocks in the vector
  // loop, choose the end of the vector loop header (=LoopVectorBody), because
  // the DomTree is not kept up-to-date for additional blocks generated in the
  // vector loop. By using the header as insertion point, we guarantee that the
  // expanded instructions dominate all their uses.
  auto GetInsertPoint = [this, &B]() {
    BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
    if (InsertBB != LoopVectorBody &&
        LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
      return LoopVectorBody->getTerminator();
    return &*B.GetInsertPoint();
  };
  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    // Step == -1 is common enough to special-case as a subtraction.
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    // Pointer inductions advance by GEP: StartValue + Index * Step elements.
    return B.CreateGEP(
        StartValue->getType()->getPointerElementType(), StartValue,
        CreateMul(Index,
                  Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();

    // Floating point operations had to be 'fast' to enable the induction.
    FastMathFlags Flags;
    Flags.setFast();

    Value *MulExp = B.CreateFMul(StepValue, Index);
    if (isa<Instruction>(MulExp))
      // We have to check, the MulExp may be a constant.
      cast<Instruction>(MulExp)->setFastMathFlags(Flags);

    // Reuse the opcode (FAdd/FSub) of the original induction update.
    Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                               "induction");
    if (isa<Instruction>(BOp))
      cast<Instruction>(BOp)->setFastMathFlags(Flags);

    return BOp;
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
3324 
/// Create the shared basic-block skeleton for the vector loop. The original
/// preheader is split twice to produce
///   vector preheader -> middle block -> scalar preheader,
/// an (initially empty) vector body is carved out of the vector preheader,
/// and a new Loop object for the vector loop is registered with LoopInfo.
/// All new block names are prefixed with \p Prefix. Returns the newly
/// created vector loop.
Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  // Cache the blocks of the original (scalar) loop that the remaining
  // skeleton-building steps refer to.
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  LoopExitBlock = OrigLoop->getUniqueExitBlock();
  assert(LoopExitBlock && "Must have an exit block");
  assert(LoopVectorPreHeader && "Invalid loop structure");

  // First split creates the middle block; splitting the middle block again
  // creates the scalar preheader through which the remainder loop is entered.
  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  // Set up branch from middle block to the exit and scalar preheader blocks.
  // completeLoopSkeleton will update the condition to use an iteration check,
  // if required to decide whether to execute the remainder.
  BranchInst *BrInst =
      BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
  // Reuse the scalar latch's debug location to avoid awkward line stepping.
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // We intentionally don't let SplitBlock to update LoopInfo since
  // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
  LoopVectorBody =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 nullptr, nullptr, Twine(Prefix) + "vector.body");

  // Update dominator for loop exit.
  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
  return Lp;
}
3372 
/// Create the "bc.resume.val" PHI nodes in the scalar preheader that give
/// each induction variable of the original loop its correct starting value
/// when the scalar (remainder) loop is entered: the transformed end value
/// when arriving from the vector loop's middle block, or the original start
/// value when arriving through a bypass block. \p VectorTripCount is the
/// number of iterations the vector loop \p L executes. \p AdditionalBypass,
/// if set, names one extra bypass block together with the induction value to
/// resume from when entering through it (both members must be set or unset
/// together).
void InnerLoopVectorizer::createInductionResumeValues(
    Loop *L, Value *VectorTripCount,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(VectorTripCount && L && "Expected valid arguments");
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");
  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.
  for (auto &InductionEntry : Legal->getInductionVars()) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the backedge-taken check block.
    PHINode *BCResumeVal =
        PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
                        LoopScalarPreHeader->getTerminator());
    // Copy original phi DL over to the new one.
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
    Value *&EndValue = IVEndValues[OrigPhi];
    Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
    if (OrigPhi == OldInduction) {
      // The primary induction counts from zero by one, so its end value is
      // simply the vector trip count.
      EndValue = VectorTripCount;
    } else {
      // For any other induction, transform the vector trip count through the
      // induction descriptor (start + step * count) to get the end value.
      IRBuilder<> B(L->getLoopPreheader()->getTerminator());
      Type *StepType = II.getStep()->getType();
      Instruction::CastOps CastOp =
          CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
      Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
      const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
      EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
      EndValue->setName("ind.end");

      // Compute the end value for the additional bypass (if applicable).
      if (AdditionalBypass.first) {
        B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
        CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
                                         StepType, true);
        CRD =
            B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
        EndValueFromAdditionalBypass =
            emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
        EndValueFromAdditionalBypass->setName("ind.end");
      }
    }
    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);

    // Fix the scalar body counter (PHI node).
    // The old induction's phi node in the scalar body needs the truncated
    // value.
    for (BasicBlock *BB : LoopBypassBlocks)
      BCResumeVal->addIncoming(II.getStartValue(), BB);

    if (AdditionalBypass.first)
      BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
                                            EndValueFromAdditionalBypass);

    OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
  }
}
3441 
/// Finish the loop skeleton for the vector loop \p L: install the
/// middle-block check that decides whether the scalar remainder must run
/// (unless the tail is folded by masking), position the IRBuilder at the
/// start of the vector body, and attach follow-up loop metadata derived from
/// \p OrigLoopID. Returns the vector loop's preheader.
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
                                                      MDNode *OrigLoopID) {
  assert(L && "Expected valid loop.");

  // The trip counts should be cached by now.
  Value *Count = getOrCreateTripCount(L);
  Value *VectorTripCount = getOrCreateVectorTripCount(L);

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
  // If tail is to be folded, we know we don't need to run the remainder.
  if (!Cost->foldTailByMasking()) {
    Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
                                        Count, VectorTripCount, "cmp.n",
                                        LoopMiddleBlock->getTerminator());

    // Here we use the same DebugLoc as the scalar loop latch terminator instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. Eg. if the compare has got a line number inside the loop.
    CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
    // The unconditional branch created by createVectorLoopSkeleton becomes
    // the conditional "run the remainder?" branch.
    cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
  }

  // Get ready to start creating new instructions into the vectorized body.
  assert(LoopVectorPreHeader == L->getLoopPreheader() &&
         "Inconsistent vector loop preheader");
  Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());

  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});
  if (VectorizedLoopID.hasValue()) {
    L->setLoopID(VectorizedLoopID.getValue());

    // Do not setAlreadyVectorized if loop attributes have been defined
    // explicitly.
    return LoopVectorPreHeader;
  }

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  if (MDNode *LID = OrigLoop->getLoopID())
    L->setLoopID(LID);

  LoopVectorizeHints Hints(L, true, *ORE);
  Hints.setAlreadyVectorized();

#ifdef EXPENSIVE_CHECKS
  // Verify the analyses we updated by hand while building the skeleton.
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
  LI->verify(*DT);
#endif

  return LoopVectorPreHeader;
}
3500 
/// Emit the complete skeleton for the main vector loop: the block structure,
/// the runtime guards (minimum iteration count, SCEV assumption checks,
/// memory overlap checks), the canonical induction variable, and the
/// resume-value PHIs for the scalar remainder loop. Returns the vector
/// loop's preheader.
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- loop iteration number check.
    /   |
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop.
  |     |
  |     v
  |   -[ ]   <--- middle-block.
  |  /  |
  | /   v
  -|- >[ ]     <--- new preheader.
   |    |
   |    v
   |   [ ] \
   |   [ ]_|   <-- old scalar loop to handle remainder.
    \   |
     \  v
      >[ ]     <-- exit block.
   ...
   */

  // Get the metadata of the original loop before it gets modified.
  MDNode *OrigLoopID = OrigLoop->getLoopID();

  // Create an empty vector loop, and prepare basic blocks for the runtime
  // checks.
  Loop *Lp = createVectorLoopSkeleton("");

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Lp, LoopScalarPreHeader);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible. However if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
  Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Emit phis for the new starting index of the scalar loop.
  createInductionResumeValues(Lp, CountRoundDown);

  return completeLoopSkeleton(Lp, OrigLoopID);
}
3585 
// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
//
// \p OrigPhi is the induction PHI in the original loop header, \p II its
// descriptor, \p CountRoundDown the vector trip count, \p EndValue the IV's
// value after the vector loop, and \p MiddleBlock the block the vector loop
// exits into.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      // Convert the count to the step's type: SIToFP for FP inductions,
      // sext/trunc otherwise.
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each-other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}
3650 
3651 namespace {
3652 
3653 struct CSEDenseMapInfo {
3654   static bool canHandle(const Instruction *I) {
3655     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3656            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3657   }
3658 
3659   static inline Instruction *getEmptyKey() {
3660     return DenseMapInfo<Instruction *>::getEmptyKey();
3661   }
3662 
3663   static inline Instruction *getTombstoneKey() {
3664     return DenseMapInfo<Instruction *>::getTombstoneKey();
3665   }
3666 
3667   static unsigned getHashValue(const Instruction *I) {
3668     assert(canHandle(I) && "Unknown instruction!");
3669     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3670                                                            I->value_op_end()));
3671   }
3672 
3673   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3674     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3675         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3676       return LHS == RHS;
3677     return LHS->isIdenticalTo(RHS);
3678   }
3679 };
3680 
3681 } // end anonymous namespace
3682 
3683 ///Perform cse of induction variable instructions.
3684 static void cse(BasicBlock *BB) {
3685   // Perform simple cse.
3686   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3687   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3688     Instruction *In = &*I++;
3689 
3690     if (!CSEDenseMapInfo::canHandle(In))
3691       continue;
3692 
3693     // Check if we can replace this instruction with any of the
3694     // visited instructions.
3695     if (Instruction *V = CSEMap.lookup(In)) {
3696       In->replaceAllUsesWith(V);
3697       In->eraseFromParent();
3698       continue;
3699     }
3700 
3701     CSEMap[In] = In;
3702   }
3703 }
3704 
3705 InstructionCost
3706 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3707                                               bool &NeedToScalarize) {
3708   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3709   Function *F = CI->getCalledFunction();
3710   Type *ScalarRetTy = CI->getType();
3711   SmallVector<Type *, 4> Tys, ScalarTys;
3712   for (auto &ArgOp : CI->arg_operands())
3713     ScalarTys.push_back(ArgOp->getType());
3714 
3715   // Estimate cost of scalarized vector call. The source operands are assumed
3716   // to be vectors, so we need to extract individual elements from there,
3717   // execute VF scalar calls, and then gather the result into the vector return
3718   // value.
3719   InstructionCost ScalarCallCost =
3720       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3721   if (VF.isScalar())
3722     return ScalarCallCost;
3723 
3724   // Compute corresponding vector type for return value and arguments.
3725   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3726   for (Type *ScalarTy : ScalarTys)
3727     Tys.push_back(ToVectorTy(ScalarTy, VF));
3728 
3729   // Compute costs of unpacking argument values for the scalar calls and
3730   // packing the return values to a vector.
3731   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3732 
3733   InstructionCost Cost =
3734       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3735 
3736   // If we can't emit a vector call for this function, then the currently found
3737   // cost is the cost we need to return.
3738   NeedToScalarize = true;
3739   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3740   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3741 
3742   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3743     return Cost;
3744 
3745   // If the corresponding vector cost is cheaper, return its cost.
3746   InstructionCost VectorCallCost =
3747       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3748   if (VectorCallCost < Cost) {
3749     NeedToScalarize = false;
3750     Cost = VectorCallCost;
3751   }
3752   return Cost;
3753 }
3754 
3755 InstructionCost
3756 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3757                                                    ElementCount VF) {
3758   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3759   assert(ID && "Expected intrinsic call!");
3760 
3761   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3762   return TTI.getIntrinsicInstrCost(CostAttrs,
3763                                    TargetTransformInfo::TCK_RecipThroughput);
3764 }
3765 
3766 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3767   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3768   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3769   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3770 }
3771 
3772 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3773   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3774   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3775   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3776 }
3777 
3778 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3779   // For every instruction `I` in MinBWs, truncate the operands, create a
3780   // truncated version of `I` and reextend its result. InstCombine runs
3781   // later and will remove any ext/trunc pairs.
3782   SmallPtrSet<Value *, 4> Erased;
3783   for (const auto &KV : Cost->getMinimalBitwidths()) {
3784     // If the value wasn't vectorized, we must maintain the original scalar
3785     // type. The absence of the value from VectorLoopValueMap indicates that it
3786     // wasn't vectorized.
3787     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3788       continue;
3789     for (unsigned Part = 0; Part < UF; ++Part) {
3790       Value *I = getOrCreateVectorValue(KV.first, Part);
3791       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3792         continue;
3793       Type *OriginalTy = I->getType();
3794       Type *ScalarTruncatedTy =
3795           IntegerType::get(OriginalTy->getContext(), KV.second);
3796       auto *TruncatedTy = FixedVectorType::get(
3797           ScalarTruncatedTy,
3798           cast<FixedVectorType>(OriginalTy)->getNumElements());
3799       if (TruncatedTy == OriginalTy)
3800         continue;
3801 
3802       IRBuilder<> B(cast<Instruction>(I));
3803       auto ShrinkOperand = [&](Value *V) -> Value * {
3804         if (auto *ZI = dyn_cast<ZExtInst>(V))
3805           if (ZI->getSrcTy() == TruncatedTy)
3806             return ZI->getOperand(0);
3807         return B.CreateZExtOrTrunc(V, TruncatedTy);
3808       };
3809 
3810       // The actual instruction modification depends on the instruction type,
3811       // unfortunately.
3812       Value *NewI = nullptr;
3813       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3814         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3815                              ShrinkOperand(BO->getOperand(1)));
3816 
3817         // Any wrapping introduced by shrinking this operation shouldn't be
3818         // considered undefined behavior. So, we can't unconditionally copy
3819         // arithmetic wrapping flags to NewI.
3820         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3821       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3822         NewI =
3823             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3824                          ShrinkOperand(CI->getOperand(1)));
3825       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3826         NewI = B.CreateSelect(SI->getCondition(),
3827                               ShrinkOperand(SI->getTrueValue()),
3828                               ShrinkOperand(SI->getFalseValue()));
3829       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3830         switch (CI->getOpcode()) {
3831         default:
3832           llvm_unreachable("Unhandled cast!");
3833         case Instruction::Trunc:
3834           NewI = ShrinkOperand(CI->getOperand(0));
3835           break;
3836         case Instruction::SExt:
3837           NewI = B.CreateSExtOrTrunc(
3838               CI->getOperand(0),
3839               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3840           break;
3841         case Instruction::ZExt:
3842           NewI = B.CreateZExtOrTrunc(
3843               CI->getOperand(0),
3844               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3845           break;
3846         }
3847       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3848         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3849                              ->getNumElements();
3850         auto *O0 = B.CreateZExtOrTrunc(
3851             SI->getOperand(0),
3852             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3853         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3854                              ->getNumElements();
3855         auto *O1 = B.CreateZExtOrTrunc(
3856             SI->getOperand(1),
3857             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3858 
3859         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3860       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3861         // Don't do anything with the operands, just extend the result.
3862         continue;
3863       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3864         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3865                             ->getNumElements();
3866         auto *O0 = B.CreateZExtOrTrunc(
3867             IE->getOperand(0),
3868             FixedVectorType::get(ScalarTruncatedTy, Elements));
3869         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3870         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3871       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3872         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3873                             ->getNumElements();
3874         auto *O0 = B.CreateZExtOrTrunc(
3875             EE->getOperand(0),
3876             FixedVectorType::get(ScalarTruncatedTy, Elements));
3877         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3878       } else {
3879         // If we don't know what to do, be conservative and don't do anything.
3880         continue;
3881       }
3882 
3883       // Lastly, extend the result.
3884       NewI->takeName(cast<Instruction>(I));
3885       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3886       I->replaceAllUsesWith(Res);
3887       cast<Instruction>(I)->eraseFromParent();
3888       Erased.insert(I);
3889       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3890     }
3891   }
3892 
3893   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3894   for (const auto &KV : Cost->getMinimalBitwidths()) {
3895     // If the value wasn't vectorized, we must maintain the original scalar
3896     // type. The absence of the value from VectorLoopValueMap indicates that it
3897     // wasn't vectorized.
3898     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3899       continue;
3900     for (unsigned Part = 0; Part < UF; ++Part) {
3901       Value *I = getOrCreateVectorValue(KV.first, Part);
3902       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3903       if (Inst && Inst->use_empty()) {
3904         Value *NewI = Inst->getOperand(0);
3905         Inst->eraseFromParent();
3906         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3907       }
3908     }
3909   }
3910 }
3911 
/// Perform the second ("fix-up") stage after all instructions have been
/// widened: narrow truncatable operations, complete the cross-iteration and
/// non-induction PHIs, patch external IV users, sink predicated scalar
/// operands, CSE the vector body, and redistribute profile weights between
/// the vector and remainder loops.
void InnerLoopVectorizer::fixVectorizedLoop() {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF.isVector())
    truncateToMinimalBitwidths();

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs();
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs();

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  // Fix-up external users of the induction variables.
  for (auto &Entry : Legal->getInductionVars())
    fixupIVUsers(Entry.first, Entry.second,
                 getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                 IVEndValues[Entry.first], LoopMiddleBlock);

  fixLCSSAPHIs();
  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that original loop
  // represented by LoopScalarBody becomes remainder loop after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting slightly roughened result but that should be OK since
  // profile is not inherently precise anyway. Note also possible bypass of
  // vector code caused by legality checks is ignored, assigning all the weight
  // to the vector loop, optimistically.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
  setProfileInfoAfterUnrolling(
      LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
      LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}
3964 
3965 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3966   // In order to support recurrences we need to be able to vectorize Phi nodes.
3967   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3968   // stage #2: We now need to fix the recurrences by adding incoming edges to
3969   // the currently empty PHI nodes. At this point every instruction in the
3970   // original loop is widened to a vector form so we can use them to construct
3971   // the incoming edges.
3972   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3973     // Handle first-order recurrences and reductions that need to be fixed.
3974     if (Legal->isFirstOrderRecurrence(&Phi))
3975       fixFirstOrderRecurrence(&Phi);
3976     else if (Legal->isReductionVariable(&Phi))
3977       fixReduction(&Phi);
3978   }
3979 }
3980 
void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because it's value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // temporary value for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After execution completes the vector loop, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Get the original loop preheader and single loop latch.
  auto *Preheader = OrigLoop->getLoopPreheader();
  auto *Latch = OrigLoop->getLoopLatch();

  // Get the initial and previous values of the scalar recurrence.
  auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
  auto *Previous = Phi->getIncomingValueForBlock(Latch);

  // Create a vector from the initial value. The initial scalar value occupies
  // the last lane so that the first shuffle below rotates it into lane 0 of
  // the first iteration's recurrence vector.
  auto *VectorInit = ScalarInit;
  if (VF.isVector()) {
    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
        Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
  }

  // We constructed a temporary phi node in the first phase of vectorization.
  // This phi node will eventually be deleted.
  Builder.SetInsertPoint(
      cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));

  // Create a phi node for the new recurrence. The current value will either be
  // the initial value inserted into a vector or loop-varying vector value.
  auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
  VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);

  // Get the vectorized previous value of the last part UF - 1. It appears last
  // among all unrolled iterations, due to the order of their construction.
  Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);

  // Find and set the insertion point after the previous value if it is an
  // instruction.
  BasicBlock::iterator InsertPt;
  // Note that the previous value may have been constant-folded so it is not
  // guaranteed to be an instruction in the vector loop.
  // FIXME: Loop invariant values do not form recurrences. We should deal with
  //        them earlier.
  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
    InsertPt = LoopVectorBody->getFirstInsertionPt();
  else {
    Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
    if (isa<PHINode>(PreviousLastPart))
      // If the previous value is a phi node, we should insert after all the phi
      // nodes in the block containing the PHI to avoid breaking basic block
      // verification. Note that the basic block may be different to
      // LoopVectorBody, in case we predicate the loop.
      InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
    else
      InsertPt = ++PreviousInst->getIterator();
  }
  Builder.SetInsertPoint(&*InsertPt);

  // We will construct a vector for the recurrence by combining the values for
  // the current and previous iterations. This is the required shuffle mask.
  // For VF = 4 the mask is <3, 4, 5, 6>: the last lane of the previous vector
  // followed by the first VF - 1 lanes of the current one.
  assert(!VF.isScalable());
  SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
  ShuffleMask[0] = VF.getKnownMinValue() - 1;
  for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
    ShuffleMask[I] = I + VF.getKnownMinValue() - 1;

  // The vector from which to take the initial value for the current iteration
  // (actual or unrolled). Initially, this is the vector phi node.
  Value *Incoming = VecPhi;

  // Shuffle the current and previous vector and update the vector parts.
  // For scalar VF the "shuffle" degenerates to forwarding Incoming directly.
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
    Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
    auto *Shuffle =
        VF.isVector()
            ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
            : Incoming;
    PhiPart->replaceAllUsesWith(Shuffle);
    cast<Instruction>(PhiPart)->eraseFromParent();
    VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
    Incoming = PreviousPart;
  }

  // Fix the latch value of the new recurrence in the vector loop.
  VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  auto *ExtractForScalar = Incoming;
  if (VF.isVector()) {
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    ExtractForScalar = Builder.CreateExtractElement(
        ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
        "vector.recur.extract");
  }
  // Extract the second last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
  // Note: VF >= 2 whenever VF.isVector(), so lane VF - 2 is always valid.
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF.isVector())
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
        "vector.recur.extract.for.phi");
  // When loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
  // `Incoming`. This is analogous to the vectorized case above: extracting the
  // second last element when VF > 1.
  else if (UF > 1)
    ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop.
  // Predecessors other than the middle block reach the scalar loop without
  // executing the vector body, so they keep the original initial value.
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, the exiting path through middle will be
  // dynamically dead and the value picked for the phi doesn't matter.
  for (PHINode &LCSSAPhi : LoopExitBlock->phis())
    if (any_of(LCSSAPhi.incoming_values(),
               [Phi](Value *V) { return V == Phi; }))
      LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
}
4165 
4166 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
4167   // Get it's reduction variable descriptor.
4168   assert(Legal->isReductionVariable(Phi) &&
4169          "Unable to find the reduction variable");
4170   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4171 
4172   RecurKind RK = RdxDesc.getRecurrenceKind();
4173   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4174   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4175   setDebugLocFromInst(Builder, ReductionStartValue);
4176   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4177 
4178   // This is the vector-clone of the value that leaves the loop.
4179   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
4180 
4181   // Wrap flags are in general invalid after vectorization, clear them.
4182   clearReductionWrapFlags(RdxDesc);
4183 
4184   // Fix the vector-loop phi.
4185 
4186   // Reductions do not have to start at zero. They can start with
4187   // any loop invariant values.
4188   BasicBlock *Latch = OrigLoop->getLoopLatch();
4189   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4190 
4191   for (unsigned Part = 0; Part < UF; ++Part) {
4192     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
4193     Value *Val = getOrCreateVectorValue(LoopVal, Part);
4194     cast<PHINode>(VecRdxPhi)
4195       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4196   }
4197 
4198   // Before each round, move the insertion point right between
4199   // the PHIs and the values we are going to write.
4200   // This allows us to write both PHINodes and the extractelement
4201   // instructions.
4202   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4203 
4204   setDebugLocFromInst(Builder, LoopExitInst);
4205 
4206   // If tail is folded by masking, the vector value to leave the loop should be
4207   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4208   // instead of the former. For an inloop reduction the reduction will already
4209   // be predicated, and does not need to be handled here.
4210   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4211     for (unsigned Part = 0; Part < UF; ++Part) {
4212       Value *VecLoopExitInst =
4213           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4214       Value *Sel = nullptr;
4215       for (User *U : VecLoopExitInst->users()) {
4216         if (isa<SelectInst>(U)) {
4217           assert(!Sel && "Reduction exit feeding two selects");
4218           Sel = U;
4219         } else
4220           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4221       }
4222       assert(Sel && "Reduction exit feeds no select");
4223       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4224 
4225       // If the target can create a predicated operator for the reduction at no
4226       // extra cost in the loop (for example a predicated vadd), it can be
4227       // cheaper for the select to remain in the loop than be sunk out of it,
4228       // and so use the select value for the phi instead of the old
4229       // LoopExitValue.
4230       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4231       if (PreferPredicatedReductionSelect ||
4232           TTI->preferPredicatedReductionSelect(
4233               RdxDesc.getOpcode(), Phi->getType(),
4234               TargetTransformInfo::ReductionFlags())) {
4235         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4236         VecRdxPhi->setIncomingValueForBlock(
4237             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4238       }
4239     }
4240   }
4241 
4242   // If the vector reduction can be performed in a smaller type, we truncate
4243   // then extend the loop exit value to enable InstCombine to evaluate the
4244   // entire expression in the smaller type.
4245   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4246     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4247     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4248     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4249     Builder.SetInsertPoint(
4250         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4251     VectorParts RdxParts(UF);
4252     for (unsigned Part = 0; Part < UF; ++Part) {
4253       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4254       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4255       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4256                                         : Builder.CreateZExt(Trunc, VecTy);
4257       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4258            UI != RdxParts[Part]->user_end();)
4259         if (*UI != Trunc) {
4260           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4261           RdxParts[Part] = Extnd;
4262         } else {
4263           ++UI;
4264         }
4265     }
4266     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4267     for (unsigned Part = 0; Part < UF; ++Part) {
4268       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4269       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4270     }
4271   }
4272 
4273   // Reduce all of the unrolled parts into a single vector.
4274   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4275   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4276 
4277   // The middle block terminator has already been assigned a DebugLoc here (the
4278   // OrigLoop's single latch terminator). We want the whole middle block to
4279   // appear to execute on this line because: (a) it is all compiler generated,
4280   // (b) these instructions are always executed after evaluating the latch
4281   // conditional branch, and (c) other passes may add new predecessors which
4282   // terminate on this line. This is the easiest way to ensure we don't
4283   // accidentally cause an extra step back into the loop while debugging.
4284   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4285   for (unsigned Part = 1; Part < UF; ++Part) {
4286     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4287     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4288       // Floating point operations had to be 'fast' to enable the reduction.
4289       ReducedPartRdx = addFastMathFlag(
4290           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4291                               ReducedPartRdx, "bin.rdx"),
4292           RdxDesc.getFastMathFlags());
4293     else
4294       ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4295   }
4296 
4297   // Create the reduction after the loop. Note that inloop reductions create the
4298   // target reduction in the loop using a Reduction recipe.
4299   if (VF.isVector() && !IsInLoopReductionPhi) {
4300     ReducedPartRdx =
4301         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4302     // If the reduction can be performed in a smaller type, we need to extend
4303     // the reduction to the wider type before we branch to the original loop.
4304     if (Phi->getType() != RdxDesc.getRecurrenceType())
4305       ReducedPartRdx =
4306         RdxDesc.isSigned()
4307         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4308         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4309   }
4310 
4311   // Create a phi node that merges control-flow from the backedge-taken check
4312   // block and the middle block.
4313   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4314                                         LoopScalarPreHeader->getTerminator());
4315   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4316     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4317   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4318 
4319   // Now, we need to fix the users of the reduction variable
4320   // inside and outside of the scalar remainder loop.
4321 
4322   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4323   // in the exit blocks.  See comment on analogous loop in
4324   // fixFirstOrderRecurrence for a more complete explaination of the logic.
4325   for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4326     if (any_of(LCSSAPhi.incoming_values(),
4327                [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4328       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4329 
4330   // Fix the scalar loop reduction variable with the incoming reduction sum
4331   // from the vector body and from the backedge value.
4332   int IncomingEdgeBlockIdx =
4333     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4334   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4335   // Pick the other block.
4336   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4337   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4338   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4339 }
4340 
4341 void InnerLoopVectorizer::clearReductionWrapFlags(
4342     RecurrenceDescriptor &RdxDesc) {
4343   RecurKind RK = RdxDesc.getRecurrenceKind();
4344   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4345     return;
4346 
4347   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4348   assert(LoopExitInstr && "null loop exit instruction");
4349   SmallVector<Instruction *, 8> Worklist;
4350   SmallPtrSet<Instruction *, 8> Visited;
4351   Worklist.push_back(LoopExitInstr);
4352   Visited.insert(LoopExitInstr);
4353 
4354   while (!Worklist.empty()) {
4355     Instruction *Cur = Worklist.pop_back_val();
4356     if (isa<OverflowingBinaryOperator>(Cur))
4357       for (unsigned Part = 0; Part < UF; ++Part) {
4358         Value *V = getOrCreateVectorValue(Cur, Part);
4359         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4360       }
4361 
4362     for (User *U : Cur->users()) {
4363       Instruction *UI = cast<Instruction>(U);
4364       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4365           Visited.insert(UI).second)
4366         Worklist.push_back(UI);
4367     }
4368   }
4369 }
4370 
4371 void InnerLoopVectorizer::fixLCSSAPHIs() {
4372   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4373     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4374       // Some phis were already hand updated by the reduction and recurrence
4375       // code above, leave them alone.
4376       continue;
4377 
4378     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4379     // Non-instruction incoming values will have only one value.
4380     unsigned LastLane = 0;
4381     if (isa<Instruction>(IncomingValue))
4382       LastLane = Cost->isUniformAfterVectorization(
4383                      cast<Instruction>(IncomingValue), VF)
4384                      ? 0
4385                      : VF.getKnownMinValue() - 1;
4386     assert((!VF.isScalable() || LastLane == 0) &&
4387            "scalable vectors dont support non-uniform scalars yet");
4388     // Can be a loop invariant incoming value or the last scalar value to be
4389     // extracted from the vectorized loop.
4390     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4391     Value *lastIncomingValue =
4392       getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4393     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4394   }
4395 }
4396 
4397 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4398   // The basic block and loop containing the predicated instruction.
4399   auto *PredBB = PredInst->getParent();
4400   auto *VectorLoop = LI->getLoopFor(PredBB);
4401 
4402   // Initialize a worklist with the operands of the predicated instruction.
4403   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4404 
4405   // Holds instructions that we need to analyze again. An instruction may be
4406   // reanalyzed if we don't yet know if we can sink it or not.
4407   SmallVector<Instruction *, 8> InstsToReanalyze;
4408 
4409   // Returns true if a given use occurs in the predicated block. Phi nodes use
4410   // their operands in their corresponding predecessor blocks.
4411   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4412     auto *I = cast<Instruction>(U.getUser());
4413     BasicBlock *BB = I->getParent();
4414     if (auto *Phi = dyn_cast<PHINode>(I))
4415       BB = Phi->getIncomingBlock(
4416           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4417     return BB == PredBB;
4418   };
4419 
4420   // Iteratively sink the scalarized operands of the predicated instruction
4421   // into the block we created for it. When an instruction is sunk, it's
4422   // operands are then added to the worklist. The algorithm ends after one pass
4423   // through the worklist doesn't sink a single instruction.
4424   bool Changed;
4425   do {
4426     // Add the instructions that need to be reanalyzed to the worklist, and
4427     // reset the changed indicator.
4428     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4429     InstsToReanalyze.clear();
4430     Changed = false;
4431 
4432     while (!Worklist.empty()) {
4433       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4434 
4435       // We can't sink an instruction if it is a phi node, is already in the
4436       // predicated block, is not in the loop, or may have side effects.
4437       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4438           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4439         continue;
4440 
4441       // It's legal to sink the instruction if all its uses occur in the
4442       // predicated block. Otherwise, there's nothing to do yet, and we may
4443       // need to reanalyze the instruction.
4444       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4445         InstsToReanalyze.push_back(I);
4446         continue;
4447       }
4448 
4449       // Move the instruction to the beginning of the predicated block, and add
4450       // it's operands to the worklist.
4451       I->moveBefore(&*PredBB->getFirstInsertionPt());
4452       Worklist.insert(I->op_begin(), I->op_end());
4453 
4454       // The sinking may have enabled other instructions to be sunk, so we will
4455       // need to iterate.
4456       Changed = true;
4457     }
4458   } while (Changed);
4459 }
4460 
4461 void InnerLoopVectorizer::fixNonInductionPHIs() {
4462   for (PHINode *OrigPhi : OrigPHIsToFix) {
4463     PHINode *NewPhi =
4464         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4465     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4466 
4467     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4468         predecessors(OrigPhi->getParent()));
4469     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4470         predecessors(NewPhi->getParent()));
4471     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4472            "Scalar and Vector BB should have the same number of predecessors");
4473 
4474     // The insertion point in Builder may be invalidated by the time we get
4475     // here. Force the Builder insertion point to something valid so that we do
4476     // not run into issues during insertion point restore in
4477     // getOrCreateVectorValue calls below.
4478     Builder.SetInsertPoint(NewPhi);
4479 
4480     // The predecessor order is preserved and we can rely on mapping between
4481     // scalar and vector block predecessors.
4482     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4483       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4484 
4485       // When looking up the new scalar/vector values to fix up, use incoming
4486       // values from original phi.
4487       Value *ScIncV =
4488           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4489 
4490       // Scalar incoming value may need a broadcast
4491       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4492       NewPhi->addIncoming(NewIncV, NewPredBB);
4493     }
4494   }
4495 }
4496 
4497 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4498                                    VPUser &Operands, unsigned UF,
4499                                    ElementCount VF, bool IsPtrLoopInvariant,
4500                                    SmallBitVector &IsIndexLoopInvariant,
4501                                    VPTransformState &State) {
4502   // Construct a vector GEP by widening the operands of the scalar GEP as
4503   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4504   // results in a vector of pointers when at least one operand of the GEP
4505   // is vector-typed. Thus, to keep the representation compact, we only use
4506   // vector-typed operands for loop-varying values.
4507 
4508   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4509     // If we are vectorizing, but the GEP has only loop-invariant operands,
4510     // the GEP we build (by only using vector-typed operands for
4511     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4512     // produce a vector of pointers, we need to either arbitrarily pick an
4513     // operand to broadcast, or broadcast a clone of the original GEP.
4514     // Here, we broadcast a clone of the original.
4515     //
4516     // TODO: If at some point we decide to scalarize instructions having
4517     //       loop-invariant operands, this special case will no longer be
4518     //       required. We would add the scalarization decision to
4519     //       collectLoopScalars() and teach getVectorValue() to broadcast
4520     //       the lane-zero scalar value.
4521     auto *Clone = Builder.Insert(GEP->clone());
4522     for (unsigned Part = 0; Part < UF; ++Part) {
4523       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4524       State.set(VPDef, GEP, EntryPart, Part);
4525       addMetadata(EntryPart, GEP);
4526     }
4527   } else {
4528     // If the GEP has at least one loop-varying operand, we are sure to
4529     // produce a vector of pointers. But if we are only unrolling, we want
4530     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4531     // produce with the code below will be scalar (if VF == 1) or vector
4532     // (otherwise). Note that for the unroll-only case, we still maintain
4533     // values in the vector mapping with initVector, as we do for other
4534     // instructions.
4535     for (unsigned Part = 0; Part < UF; ++Part) {
4536       // The pointer operand of the new GEP. If it's loop-invariant, we
4537       // won't broadcast it.
4538       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4539                                      : State.get(Operands.getOperand(0), Part);
4540 
4541       // Collect all the indices for the new GEP. If any index is
4542       // loop-invariant, we won't broadcast it.
4543       SmallVector<Value *, 4> Indices;
4544       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4545         VPValue *Operand = Operands.getOperand(I);
4546         if (IsIndexLoopInvariant[I - 1])
4547           Indices.push_back(State.get(Operand, {0, 0}));
4548         else
4549           Indices.push_back(State.get(Operand, Part));
4550       }
4551 
4552       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4553       // but it should be a vector, otherwise.
4554       auto *NewGEP =
4555           GEP->isInBounds()
4556               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4557                                           Indices)
4558               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4559       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4560              "NewGEP is not a pointer vector");
4561       State.set(VPDef, GEP, NewGEP, Part);
4562       addMetadata(NewGEP, GEP);
4563     }
4564   }
4565 }
4566 
4567 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4568                                               RecurrenceDescriptor *RdxDesc,
4569                                               Value *StartV, unsigned UF,
4570                                               ElementCount VF) {
4571   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4572   PHINode *P = cast<PHINode>(PN);
4573   if (EnableVPlanNativePath) {
4574     // Currently we enter here in the VPlan-native path for non-induction
4575     // PHIs where all control flow is uniform. We simply widen these PHIs.
4576     // Create a vector phi with no operands - the vector phi operands will be
4577     // set at the end of vector code generation.
4578     Type *VecTy =
4579         (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4580     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4581     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4582     OrigPHIsToFix.push_back(P);
4583 
4584     return;
4585   }
4586 
4587   assert(PN->getParent() == OrigLoop->getHeader() &&
4588          "Non-header phis should have been handled elsewhere");
4589 
4590   // In order to support recurrences we need to be able to vectorize Phi nodes.
4591   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4592   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4593   // this value when we vectorize all of the instructions that use the PHI.
4594   if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4595     Value *Iden = nullptr;
4596     bool ScalarPHI =
4597         (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4598     Type *VecTy =
4599         ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4600 
4601     if (RdxDesc) {
4602       assert(Legal->isReductionVariable(P) && StartV &&
4603              "RdxDesc should only be set for reduction variables; in that case "
4604              "a StartV is also required");
4605       RecurKind RK = RdxDesc->getRecurrenceKind();
4606       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
4607         // MinMax reduction have the start value as their identify.
4608         if (ScalarPHI) {
4609           Iden = StartV;
4610         } else {
4611           IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4612           Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4613           StartV = Iden = Builder.CreateVectorSplat(VF, StartV, "minmax.ident");
4614         }
4615       } else {
4616         Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4617             RK, VecTy->getScalarType());
4618         Iden = IdenC;
4619 
4620         if (!ScalarPHI) {
4621           Iden = ConstantVector::getSplat(VF, IdenC);
4622           IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4623           Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4624           Constant *Zero = Builder.getInt32(0);
4625           StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4626         }
4627       }
4628     }
4629 
4630     for (unsigned Part = 0; Part < UF; ++Part) {
4631       // This is phase one of vectorizing PHIs.
4632       Value *EntryPart = PHINode::Create(
4633           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4634       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4635       if (StartV) {
4636         // Make sure to add the reduction start value only to the
4637         // first unroll part.
4638         Value *StartVal = (Part == 0) ? StartV : Iden;
4639         cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4640       }
4641     }
4642     return;
4643   }
4644 
4645   assert(!Legal->isReductionVariable(P) &&
4646          "reductions should be handled above");
4647 
4648   setDebugLocFromInst(Builder, P);
4649 
4650   // This PHINode must be an induction variable.
4651   // Make sure that we know about it.
4652   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4653 
4654   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4655   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4656 
4657   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4658   // which can be found from the original scalar operations.
4659   switch (II.getKind()) {
4660   case InductionDescriptor::IK_NoInduction:
4661     llvm_unreachable("Unknown induction");
4662   case InductionDescriptor::IK_IntInduction:
4663   case InductionDescriptor::IK_FpInduction:
4664     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4665   case InductionDescriptor::IK_PtrInduction: {
4666     // Handle the pointer induction variable case.
4667     assert(P->getType()->isPointerTy() && "Unexpected type.");
4668 
4669     if (Cost->isScalarAfterVectorization(P, VF)) {
4670       // This is the normalized GEP that starts counting at zero.
4671       Value *PtrInd =
4672           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4673       // Determine the number of scalars we need to generate for each unroll
4674       // iteration. If the instruction is uniform, we only need to generate the
4675       // first lane. Otherwise, we generate all VF values.
4676       unsigned Lanes =
4677           Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4678       for (unsigned Part = 0; Part < UF; ++Part) {
4679         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4680           Constant *Idx = ConstantInt::get(PtrInd->getType(),
4681                                            Lane + Part * VF.getKnownMinValue());
4682           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4683           Value *SclrGep =
4684               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4685           SclrGep->setName("next.gep");
4686           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4687         }
4688       }
4689       return;
4690     }
4691     assert(isa<SCEVConstant>(II.getStep()) &&
4692            "Induction step not a SCEV constant!");
4693     Type *PhiType = II.getStep()->getType();
4694 
4695     // Build a pointer phi
4696     Value *ScalarStartValue = II.getStartValue();
4697     Type *ScStValueType = ScalarStartValue->getType();
4698     PHINode *NewPointerPhi =
4699         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4700     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4701 
4702     // A pointer induction, performed by using a gep
4703     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4704     Instruction *InductionLoc = LoopLatch->getTerminator();
4705     const SCEV *ScalarStep = II.getStep();
4706     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4707     Value *ScalarStepValue =
4708         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4709     Value *InductionGEP = GetElementPtrInst::Create(
4710         ScStValueType->getPointerElementType(), NewPointerPhi,
4711         Builder.CreateMul(
4712             ScalarStepValue,
4713             ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4714         "ptr.ind", InductionLoc);
4715     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4716 
4717     // Create UF many actual address geps that use the pointer
4718     // phi as base and a vectorized version of the step value
4719     // (<step*0, ..., step*N>) as offset.
4720     for (unsigned Part = 0; Part < UF; ++Part) {
4721       SmallVector<Constant *, 8> Indices;
4722       // Create a vector of consecutive numbers from zero to VF.
4723       for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4724         Indices.push_back(
4725             ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4726       Constant *StartOffset = ConstantVector::get(Indices);
4727 
4728       Value *GEP = Builder.CreateGEP(
4729           ScStValueType->getPointerElementType(), NewPointerPhi,
4730           Builder.CreateMul(
4731               StartOffset,
4732               Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4733               "vector.gep"));
4734       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4735     }
4736   }
4737   }
4738 }
4739 
4740 /// A helper function for checking whether an integer division-related
4741 /// instruction may divide by zero (in which case it must be predicated if
4742 /// executed conditionally in the scalar code).
4743 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4744 /// Non-zero divisors that are non compile-time constants will not be
4745 /// converted into multiplication, so we will still end up scalarizing
4746 /// the division, but can do so w/o predication.
4747 static bool mayDivideByZero(Instruction &I) {
4748   assert((I.getOpcode() == Instruction::UDiv ||
4749           I.getOpcode() == Instruction::SDiv ||
4750           I.getOpcode() == Instruction::URem ||
4751           I.getOpcode() == Instruction::SRem) &&
4752          "Unexpected instruction");
4753   Value *Divisor = I.getOperand(1);
4754   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4755   return !CInt || CInt->isZero();
4756 }
4757 
// Widen a "simple" instruction -- a unary/binary operator, a compare, or a
// cast -- by emitting one vector instruction per unroll part. Calls,
// branches, phis, GEPs and selects are handled by dedicated recipes and must
// never reach this function.
void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
                                           VPUser &User,
                                           VPTransformState &State) {
  switch (I.getOpcode()) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    setDebugLocFromInst(Builder, &I);

    for (unsigned Part = 0; Part < UF; ++Part) {
      // Collect the widened operands for this unroll part from the VPlan
      // state, then emit the corresponding n-ary operation on them.
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : User.operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

      // Carry over wrapping/fast-math/exact flags from the scalar original,
      // if the builder produced an instruction (it may constant-fold).
      if (auto *VecOp = dyn_cast<Instruction>(V))
        VecOp->copyIRFlags(&I);

      // Use this vector value for all users of the original instruction.
      State.set(Def, &I, V, Part);
      addMetadata(V, &I);
    }

    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = cast<CmpInst>(&I);
    setDebugLocFromInst(Builder, Cmp);
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = State.get(User.getOperand(0), Part);
      Value *B = State.get(User.getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      State.set(Def, &I, C, Part);
      addMetadata(C, &I);
    }

    break;
  }

  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto *CI = cast<CastInst>(&I);
    setDebugLocFromInst(Builder, CI);

    /// Vectorize casts. The destination type is widened to a vector of VF
    /// elements unless we are vectorizing with VF == 1.
    Type *DestTy =
        (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = State.get(User.getOperand(0), Part);
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      State.set(Def, &I, Cast, Part);
      addMetadata(Cast, &I);
    }
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}
4865 
4866 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4867                                                VPUser &ArgOperands,
4868                                                VPTransformState &State) {
4869   assert(!isa<DbgInfoIntrinsic>(I) &&
4870          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4871   setDebugLocFromInst(Builder, &I);
4872 
4873   Module *M = I.getParent()->getParent()->getParent();
4874   auto *CI = cast<CallInst>(&I);
4875 
4876   SmallVector<Type *, 4> Tys;
4877   for (Value *ArgOperand : CI->arg_operands())
4878     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4879 
4880   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4881 
4882   // The flag shows whether we use Intrinsic or a usual Call for vectorized
4883   // version of the instruction.
4884   // Is it beneficial to perform intrinsic call compared to lib call?
4885   bool NeedToScalarize = false;
4886   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4887   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4888   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4889   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4890          "Instruction should be scalarized elsewhere.");
4891   assert(IntrinsicCost.isValid() && CallCost.isValid() &&
4892          "Cannot have invalid costs while widening");
4893 
4894   for (unsigned Part = 0; Part < UF; ++Part) {
4895     SmallVector<Value *, 4> Args;
4896     for (auto &I : enumerate(ArgOperands.operands())) {
4897       // Some intrinsics have a scalar argument - don't replace it with a
4898       // vector.
4899       Value *Arg;
4900       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4901         Arg = State.get(I.value(), Part);
4902       else
4903         Arg = State.get(I.value(), {0, 0});
4904       Args.push_back(Arg);
4905     }
4906 
4907     Function *VectorF;
4908     if (UseVectorIntrinsic) {
4909       // Use vector version of the intrinsic.
4910       Type *TysForDecl[] = {CI->getType()};
4911       if (VF.isVector()) {
4912         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4913         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4914       }
4915       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4916       assert(VectorF && "Can't retrieve vector intrinsic.");
4917     } else {
4918       // Use vector version of the function call.
4919       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4920 #ifndef NDEBUG
4921       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4922              "Can't create vector function.");
4923 #endif
4924         VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4925     }
4926       SmallVector<OperandBundleDef, 1> OpBundles;
4927       CI->getOperandBundlesAsDefs(OpBundles);
4928       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4929 
4930       if (isa<FPMathOperator>(V))
4931         V->copyFastMathFlags(CI);
4932 
4933       State.set(Def, &I, V, Part);
4934       addMetadata(V, &I);
4935   }
4936 }
4937 
4938 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
4939                                                  VPUser &Operands,
4940                                                  bool InvariantCond,
4941                                                  VPTransformState &State) {
4942   setDebugLocFromInst(Builder, &I);
4943 
4944   // The condition can be loop invariant  but still defined inside the
4945   // loop. This means that we can't just use the original 'cond' value.
4946   // We have to take the 'vectorized' value and pick the first lane.
4947   // Instcombine will make this a no-op.
4948   auto *InvarCond =
4949       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4950 
4951   for (unsigned Part = 0; Part < UF; ++Part) {
4952     Value *Cond =
4953         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4954     Value *Op0 = State.get(Operands.getOperand(1), Part);
4955     Value *Op1 = State.get(Operands.getOperand(2), Part);
4956     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4957     State.set(VPDef, &I, Sel, Part);
4958     addMetadata(Sel, &I);
4959   }
4960 }
4961 
// Compute, for the given VF, the set of instructions that will remain scalar
// after vectorization, and record it in Scalars[VF]. The analysis seeds a
// worklist with known-scalar values and expands it along pointer/bitcast/GEP
// chains and induction variables.
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
         "This function should not be visited twice for the same VF");

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that returns true if Ptr is a pointer-induction phi whose use by
  // MemAccess is a scalar use (as determined by isScalarUse above).
  auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
    if (!isa<PHINode>(Ptr) ||
        !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
      return false;
    auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
    if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
      return false;
    return isScalarUse(MemAccess, Ptr);
  };

  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is actually the pointer induction of a loop, it is being
  // inserted into Worklist. If the use will be a scalar use, and the
  // pointer is only used by memory accesses, we place the pointer in
  // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    if (isScalarPtrInduction(MemAccess, Ptr)) {
      // The pointer-induction phi and its latch update both stay scalar.
      Worklist.insert(cast<Instruction>(Ptr));
      Instruction *Update = cast<Instruction>(
          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
      Worklist.insert(Update);
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
                        << "\n");
      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
                        << "\n");
      return;
    }
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory accesses is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  // A pointer is scalar only if no use of it was found to be non-scalar.
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
5148 
5149 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
5150                                                          ElementCount VF) {
5151   if (!blockNeedsPredication(I->getParent()))
5152     return false;
5153   switch(I->getOpcode()) {
5154   default:
5155     break;
5156   case Instruction::Load:
5157   case Instruction::Store: {
5158     if (!Legal->isMaskRequired(I))
5159       return false;
5160     auto *Ptr = getLoadStorePointerOperand(I);
5161     auto *Ty = getMemInstValueType(I);
5162     // We have already decided how to vectorize this instruction, get that
5163     // result.
5164     if (VF.isVector()) {
5165       InstWidening WideningDecision = getWideningDecision(I, VF);
5166       assert(WideningDecision != CM_Unknown &&
5167              "Widening decision should be ready at this moment");
5168       return WideningDecision == CM_Scalarize;
5169     }
5170     const Align Alignment = getLoadStoreAlignment(I);
5171     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5172                                 isLegalMaskedGather(Ty, Alignment))
5173                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5174                                 isLegalMaskedScatter(Ty, Alignment));
5175   }
5176   case Instruction::UDiv:
5177   case Instruction::SDiv:
5178   case Instruction::SRem:
5179   case Instruction::URem:
5180     return mayDivideByZero(*I);
5181   }
5182   return false;
5183 }
5184 
5185 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5186     Instruction *I, ElementCount VF) {
5187   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5188   assert(getWideningDecision(I, VF) == CM_Unknown &&
5189          "Decision should not be set yet.");
5190   auto *Group = getInterleavedAccessGroup(I);
5191   assert(Group && "Must have a group.");
5192 
5193   // If the instruction's allocated size doesn't equal it's type size, it
5194   // requires padding and will be scalarized.
5195   auto &DL = I->getModule()->getDataLayout();
5196   auto *ScalarTy = getMemInstValueType(I);
5197   if (hasIrregularType(ScalarTy, DL, VF))
5198     return false;
5199 
5200   // Check if masking is required.
5201   // A Group may need masking for one of two reasons: it resides in a block that
5202   // needs predication, or it was decided to use masking to deal with gaps.
5203   bool PredicatedAccessRequiresMasking =
5204       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5205   bool AccessWithGapsRequiresMasking =
5206       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5207   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5208     return true;
5209 
5210   // If masked interleaving is required, we expect that the user/target had
5211   // enabled it, because otherwise it either wouldn't have been created or
5212   // it should have been invalidated by the CostModel.
5213   assert(useMaskedInterleavedAccesses(TTI) &&
5214          "Masked interleave-groups for predicated accesses are not enabled.");
5215 
5216   auto *Ty = getMemInstValueType(I);
5217   const Align Alignment = getLoadStoreAlignment(I);
5218   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5219                           : TTI.isLegalMaskedStore(Ty, Alignment);
5220 }
5221 
5222 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5223     Instruction *I, ElementCount VF) {
5224   // Get and ensure we have a valid memory instruction.
5225   LoadInst *LI = dyn_cast<LoadInst>(I);
5226   StoreInst *SI = dyn_cast<StoreInst>(I);
5227   assert((LI || SI) && "Invalid memory instruction");
5228 
5229   auto *Ptr = getLoadStorePointerOperand(I);
5230 
5231   // In order to be widened, the pointer should be consecutive, first of all.
5232   if (!Legal->isConsecutivePtr(Ptr))
5233     return false;
5234 
5235   // If the instruction is a store located in a predicated block, it will be
5236   // scalarized.
5237   if (isScalarWithPredication(I))
5238     return false;
5239 
5240   // If the instruction's allocated size doesn't equal it's type size, it
5241   // requires padding and will be scalarized.
5242   auto &DL = I->getModule()->getDataLayout();
5243   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5244   if (hasIrregularType(ScalarTy, DL, VF))
5245     return false;
5246 
5247   return true;
5248 }
5249 
5250 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5251   // We should not collect Uniforms more than once per VF. Right now,
5252   // this function is called from collectUniformsAndScalars(), which
5253   // already does this check. Collecting Uniforms for VF=1 does not make any
5254   // sense.
5255 
5256   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5257          "This function should not be visited twice for the same VF");
5258 
5259   // Visit the list of Uniforms. If we'll not find any uniform value, we'll
5260   // not analyze again.  Uniforms.count(VF) will return 1.
5261   Uniforms[VF].clear();
5262 
5263   // We now know that the loop is vectorizable!
5264   // Collect instructions inside the loop that will remain uniform after
5265   // vectorization.
5266 
5267   // Global values, params and instructions outside of current loop are out of
5268   // scope.
5269   auto isOutOfScope = [&](Value *V) -> bool {
5270     Instruction *I = dyn_cast<Instruction>(V);
5271     return (!I || !TheLoop->contains(I));
5272   };
5273 
5274   SetVector<Instruction *> Worklist;
5275   BasicBlock *Latch = TheLoop->getLoopLatch();
5276 
5277   // Instructions that are scalar with predication must not be considered
5278   // uniform after vectorization, because that would create an erroneous
5279   // replicating region where only a single instance out of VF should be formed.
5280   // TODO: optimize such seldom cases if found important, see PR40816.
5281   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5282     if (isOutOfScope(I)) {
5283       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5284                         << *I << "\n");
5285       return;
5286     }
5287     if (isScalarWithPredication(I, VF)) {
5288       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5289                         << *I << "\n");
5290       return;
5291     }
5292     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5293     Worklist.insert(I);
5294   };
5295 
5296   // Start with the conditional branch. If the branch condition is an
5297   // instruction contained in the loop that is only used by the branch, it is
5298   // uniform.
5299   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5300   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5301     addToWorklistIfAllowed(Cmp);
5302 
5303   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5304     InstWidening WideningDecision = getWideningDecision(I, VF);
5305     assert(WideningDecision != CM_Unknown &&
5306            "Widening decision should be ready at this moment");
5307 
5308     // A uniform memory op is itself uniform.  We exclude uniform stores
5309     // here as they demand the last lane, not the first one.
5310     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5311       assert(WideningDecision == CM_Scalarize);
5312       return true;
5313     }
5314 
5315     return (WideningDecision == CM_Widen ||
5316             WideningDecision == CM_Widen_Reverse ||
5317             WideningDecision == CM_Interleave);
5318   };
5319 
5320 
5321   // Returns true if Ptr is the pointer operand of a memory access instruction
5322   // I, and I is known to not require scalarization.
5323   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5324     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5325   };
5326 
5327   // Holds a list of values which are known to have at least one uniform use.
5328   // Note that there may be other uses which aren't uniform.  A "uniform use"
5329   // here is something which only demands lane 0 of the unrolled iterations;
5330   // it does not imply that all lanes produce the same value (e.g. this is not
5331   // the usual meaning of uniform)
5332   SmallPtrSet<Value *, 8> HasUniformUse;
5333 
5334   // Scan the loop for instructions which are either a) known to have only
5335   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5336   for (auto *BB : TheLoop->blocks())
5337     for (auto &I : *BB) {
5338       // If there's no pointer operand, there's nothing to do.
5339       auto *Ptr = getLoadStorePointerOperand(&I);
5340       if (!Ptr)
5341         continue;
5342 
5343       // A uniform memory op is itself uniform.  We exclude uniform stores
5344       // here as they demand the last lane, not the first one.
5345       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5346         addToWorklistIfAllowed(&I);
5347 
5348       if (isUniformDecision(&I, VF)) {
5349         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5350         HasUniformUse.insert(Ptr);
5351       }
5352     }
5353 
5354   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5355   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5356   // disallows uses outside the loop as well.
5357   for (auto *V : HasUniformUse) {
5358     if (isOutOfScope(V))
5359       continue;
5360     auto *I = cast<Instruction>(V);
5361     auto UsersAreMemAccesses =
5362       llvm::all_of(I->users(), [&](User *U) -> bool {
5363         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5364       });
5365     if (UsersAreMemAccesses)
5366       addToWorklistIfAllowed(I);
5367   }
5368 
5369   // Expand Worklist in topological order: whenever a new instruction
5370   // is added , its users should be already inside Worklist.  It ensures
5371   // a uniform instruction will only be used by uniform instructions.
5372   unsigned idx = 0;
5373   while (idx != Worklist.size()) {
5374     Instruction *I = Worklist[idx++];
5375 
5376     for (auto OV : I->operand_values()) {
5377       // isOutOfScope operands cannot be uniform instructions.
5378       if (isOutOfScope(OV))
5379         continue;
5380       // First order recurrence Phi's should typically be considered
5381       // non-uniform.
5382       auto *OP = dyn_cast<PHINode>(OV);
5383       if (OP && Legal->isFirstOrderRecurrence(OP))
5384         continue;
5385       // If all the users of the operand are uniform, then add the
5386       // operand into the uniform worklist.
5387       auto *OI = cast<Instruction>(OV);
5388       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5389             auto *J = cast<Instruction>(U);
5390             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5391           }))
5392         addToWorklistIfAllowed(OI);
5393     }
5394   }
5395 
5396   // For an instruction to be added into Worklist above, all its users inside
5397   // the loop should also be in Worklist. However, this condition cannot be
5398   // true for phi nodes that form a cyclic dependence. We must process phi
5399   // nodes separately. An induction variable will remain uniform if all users
5400   // of the induction variable and induction variable update remain uniform.
5401   // The code below handles both pointer and non-pointer induction variables.
5402   for (auto &Induction : Legal->getInductionVars()) {
5403     auto *Ind = Induction.first;
5404     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5405 
5406     // Determine if all users of the induction variable are uniform after
5407     // vectorization.
5408     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5409       auto *I = cast<Instruction>(U);
5410       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5411              isVectorizedMemAccessUse(I, Ind);
5412     });
5413     if (!UniformInd)
5414       continue;
5415 
5416     // Determine if all users of the induction variable update instruction are
5417     // uniform after vectorization.
5418     auto UniformIndUpdate =
5419         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5420           auto *I = cast<Instruction>(U);
5421           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5422                  isVectorizedMemAccessUse(I, IndUpdate);
5423         });
5424     if (!UniformIndUpdate)
5425       continue;
5426 
5427     // The induction variable and its update instruction will remain uniform.
5428     addToWorklistIfAllowed(Ind);
5429     addToWorklistIfAllowed(IndUpdate);
5430   }
5431 
5432   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5433 }
5434 
5435 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5436   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5437 
5438   if (Legal->getRuntimePointerChecking()->Need) {
5439     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5440         "runtime pointer checks needed. Enable vectorization of this "
5441         "loop with '#pragma clang loop vectorize(enable)' when "
5442         "compiling with -Os/-Oz",
5443         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5444     return true;
5445   }
5446 
5447   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5448     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5449         "runtime SCEV checks needed. Enable vectorization of this "
5450         "loop with '#pragma clang loop vectorize(enable)' when "
5451         "compiling with -Os/-Oz",
5452         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5453     return true;
5454   }
5455 
5456   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5457   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5458     reportVectorizationFailure("Runtime stride check for small trip count",
5459         "runtime stride == 1 checks needed. Enable vectorization of "
5460         "this loop without such check by compiling with -Os/-Oz",
5461         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5462     return true;
5463   }
5464 
5465   return false;
5466 }
5467 
// Compute the maximum vectorization factor for this loop, or None if the loop
// cannot be vectorized. As a side effect this also decides the tail-handling
// strategy: it may reset ScalarEpilogueStatus back to "allowed" or set
// FoldTailByMasking when the remainder iterations must be masked.
Optional<ElementCount>
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
  // Runtime memory checks introduce a branch; bail out on divergent targets
  // where such control flow is expensive.
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to allow this anyway, since the check is still
    // likely to be dynamically uniform if the target can skip it cheaply.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return None;
  }

  // TC is 0 when the trip count is not a compile-time constant.
  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return None;
  }

  ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);

  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    // A scalar epilogue handles the tail; nothing more to decide here.
    return MaxVF;
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return None;

    break;
  }

  // The only loops we can vectorize without a scalar epilogue, are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration.  This will
  // require a lane mask which varies through the vector loop body.  (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fallback to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return MaxVF;
    }
    return None;
  }

  // From this point on, attempt to fold the tail into the vector body by
  // masking.

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  assert(!MaxVF.isScalable() &&
         "Scalable vectors do not yet support tail folding");
  assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
         "MaxVF must be a power of 2");
  unsigned MaxVFtimesIC =
      UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
  // Avoid tail folding if the trip count is known to be a multiple of any VF we
  // chose.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  // The trip count as a SCEV expression: backedge-taken count + 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
  const SCEV *Rem = SE->getURemExpr(
      ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
  if (Rem->isZero()) {
    // Accept MaxVF if we do not have a tail.
    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
    return MaxVF;
  }

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxVF;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fallback to a vectorization with a scalar epilogue.
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                         "scalar epilogue instead.\n");
    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
    return MaxVF;
  }

  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
    return None;
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return None;
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return None;
}
5607 
// Compute the maximum feasible vectorization factor from the target's
// register width, the types used in the loop, and the maximum safe dependence
// distance computed by LAA. A legal user-specified VF is returned as-is; an
// unsafe one is clamped to the maximum safe VF.
ElementCount
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
                                                 ElementCount UserVF) {
  // A scalable UserVF is treated as unspecified when the target cannot (and
  // is not forced to) execute scalable vectors.
  bool IgnoreScalableUserVF = UserVF.isScalable() &&
                              !TTI.supportsScalableVectors() &&
                              !ForceTargetSupportsScalableVectors;
  if (IgnoreScalableUserVF) {
    LLVM_DEBUG(
        dbgs() << "LV: Ignoring VF=" << UserVF
               << " because target does not support scalable vectors.\n");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF",
                                        TheLoop->getStartLoc(),
                                        TheLoop->getHeader())
             << "Ignoring VF=" << ore::NV("UserVF", UserVF)
             << " because target does not support scalable vectors.";
    });
  }

  // Beyond this point two scenarios are handled. If UserVF isn't specified
  // then a suitable VF is chosen. If UserVF is specified and there are
  // dependencies, check if it's legal. However, if a UserVF is specified and
  // there are no dependencies, then there's nothing to do.
  if (UserVF.isNonZero() && !IgnoreScalableUserVF &&
      Legal->isSafeForAnyVectorWidth())
    return UserVF;

  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
  unsigned WidestRegister = TTI.getRegisterBitWidth(true);

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();

  // If the user vectorization factor is legally unsafe, clamp it to a safe
  // value. Otherwise, return as is.
  if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
    unsigned MaxSafeElements =
        PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
    ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);

    if (UserVF.isScalable()) {
      Optional<unsigned> MaxVScale = TTI.getMaxVScale();

      // Scale VF by vscale before checking if it's safe. If vscale is
      // unknown, no scalable VF can be proven safe (element count 0 below).
      MaxSafeVF = ElementCount::getScalable(
          MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);

      if (MaxSafeVF.isZero()) {
        // The dependence distance is too small to use scalable vectors,
        // fallback on fixed.
        LLVM_DEBUG(
            dbgs()
            << "LV: Max legal vector width too small, scalable vectorization "
               "unfeasible. Using fixed-width vectorization instead.\n");
        ORE->emit([&]() {
          return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
                                            TheLoop->getStartLoc(),
                                            TheLoop->getHeader())
                 << "Max legal vector width too small, scalable vectorization "
                 << "unfeasible. Using fixed-width vectorization instead.";
        });
        // Retry with the scalable UserVF reinterpreted as a fixed-width VF.
        return computeFeasibleMaxVF(
            ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
      }
    }

    LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");

    if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
      return UserVF;

    LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                      << " is unsafe, clamping to max safe VF=" << MaxSafeVF
                      << ".\n");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                        TheLoop->getStartLoc(),
                                        TheLoop->getHeader())
             << "User-specified vectorization factor "
             << ore::NV("UserVectorizationFactor", UserVF)
             << " is unsafe, clamping to maximum safe vectorization factor "
             << ore::NV("VectorizationFactor", MaxSafeVF);
    });
    return MaxSafeVF;
  }

  // No (usable) UserVF: derive a VF from register width and type sizes,
  // bounded by the safe dependence distance.
  WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be a powers of 2.
  unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << WidestRegister << " bits.\n");

  assert(MaxVectorSize <= WidestRegister &&
         "Did not expect to pack so many elements"
         " into one vector!");
  if (MaxVectorSize == 0) {
    LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
    MaxVectorSize = 1;
    return ElementCount::getFixed(MaxVectorSize);
  } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
             isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to be the ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below.
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
                      << ConstTripCount << "\n");
    MaxVectorSize = ConstTripCount;
    return ElementCount::getFixed(MaxVectorSize);
  }

  unsigned MaxVF = MaxVectorSize;
  if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorSize).
    SmallVector<ElementCount, 8> VFs;
    unsigned NewMaxVectorSize = WidestRegister / SmallestType;
    for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
      VFs.push_back(ElementCount::getFixed(VS));

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    for (int i = RUs.size() - 1; i >= 0; --i) {
      bool Selected = true;
      for (auto& pair : RUs[i].MaxLocalUsers) {
        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
        if (pair.second > TargetNumRegisters)
          Selected = false;
      }
      if (Selected) {
        MaxVF = VFs[i].getKnownMinValue();
        break;
      }
    }
    // Respect the target's minimum VF, if any.
    if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }
  }
  return ElementCount::getFixed(MaxVF);
}
5764 
// Pick the most profitable vectorization factor in [1, MaxVF] by comparing
// the per-lane cost of each power-of-two candidate width against the scalar
// loop cost. Every profitable candidate is also recorded in ProfitableVFs
// for later epilogue-vectorization decisions.
VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
  // FIXME: This can be fixed for scalable vectors later, because at this stage
  // the LoopVectorizer will only consider vectorizing a loop with scalable
  // vectors when the loop has a hint to enable vectorization for a given VF.
  assert(!MaxVF.isScalable() && "scalable vectors not yet supported");

  InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");

  unsigned Width = 1;
  // Cost of a single scalar iteration; Cost tracks the best per-lane cost
  // seen so far (initially the scalar cost, i.e. width 1).
  const float ScalarCost = *ExpectedCost.getValue();
  float Cost = ScalarCost;

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && MaxVF.isVector()) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    Cost = std::numeric_limits<float>::max();
  }

  for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
    // Notice that the vector loop needs to be executed less times, so
    // we need to divide the cost of the vector loops by the width of
    // the vector elements.
    VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
    assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
    float VectorCost = *C.first.getValue() / (float)i;
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (int)VectorCost << ".\n");
    // C.second is false when the candidate would emit no vector instructions
    // at all; skip it unless vectorization is forced.
    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }

    // If profitable add it to ProfitableVF list.
    if (VectorCost < ScalarCost) {
      ProfitableVFs.push_back(VectorizationFactor(
          {ElementCount::getFixed(i), (unsigned)VectorCost}));
    }

    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  if (!EnableCondStoresVectorization && NumPredStores) {
    // Conditional-store vectorization is disabled; fall back to scalar.
    reportVectorizationFailure("There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, TheLoop);
    Width = 1;
    Cost = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
  // Cost is per-lane; scale it back up by the selected width to report an
  // absolute loop cost.
  VectorizationFactor Factor = {ElementCount::getFixed(Width),
                                (unsigned)(Width * Cost)};
  return Factor;
}
5832 
5833 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5834     const Loop &L, ElementCount VF) const {
5835   // Cross iteration phis such as reductions need special handling and are
5836   // currently unsupported.
5837   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5838         return Legal->isFirstOrderRecurrence(&Phi) ||
5839                Legal->isReductionVariable(&Phi);
5840       }))
5841     return false;
5842 
5843   // Phis with uses outside of the loop require special handling and are
5844   // currently unsupported.
5845   for (auto &Entry : Legal->getInductionVars()) {
5846     // Look for uses of the value of the induction at the last iteration.
5847     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5848     for (User *U : PostInc->users())
5849       if (!L.contains(cast<Instruction>(U)))
5850         return false;
5851     // Look for uses of penultimate value of the induction.
5852     for (User *U : Entry.first->users())
5853       if (!L.contains(cast<Instruction>(U)))
5854         return false;
5855   }
5856 
5857   // Induction variables that are widened require special handling that is
5858   // currently not supported.
5859   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5860         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5861                  this->isProfitableToScalarize(Entry.first, VF));
5862       }))
5863     return false;
5864 
5865   return true;
5866 }
5867 
5868 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5869     const ElementCount VF) const {
5870   // FIXME: We need a much better cost-model to take different parameters such
5871   // as register pressure, code size increase and cost of extra branches into
5872   // account. For now we apply a very crude heuristic and only consider loops
5873   // with vectorization factors larger than a certain value.
5874   // We also consider epilogue vectorization unprofitable for targets that don't
5875   // consider interleaving beneficial (eg. MVE).
5876   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5877     return false;
5878   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5879     return true;
5880   return false;
5881 }
5882 
// Select a vectorization factor for the epilogue loop that runs after a main
// loop vectorized with MainLoopVF. Returns VectorizationFactor::Disabled()
// when epilogue vectorization is disabled, unsupported for this loop, or not
// profitable.
VectorizationFactor
LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
    return Result;
  }

  // Epilogue vectorization only makes sense when a scalar epilogue (and hence
  // a remainder loop) may exist at all.
  if (!isScalarEpilogueAllowed()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
                  "allowed.\n";);
    return Result;
  }

  // FIXME: This can be fixed for scalable vectors later, because at this stage
  // the LoopVectorizer will only consider vectorizing a loop with scalable
  // vectors when the loop has a hint to enable vectorization for a given VF.
  if (MainLoopVF.isScalable()) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
                         "yet supported.\n");
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
                  "not a supported candidate.\n";);
    return Result;
  }

  // A forced epilogue VF (testing/debugging hook) bypasses the cost check but
  // still requires that a VPlan exists for the pair of VFs.
  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    if (LVP.hasPlanWithVFs(
            {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
      return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
    else {
      LLVM_DEBUG(
          dbgs()
              << "LEV: Epilogue vectorization forced factor is not viable.\n";);
      return Result;
    }
  }

  // Epilogue vectorization duplicates the loop body; skip it when optimizing
  // the function for size.
  if (TheLoop->getHeader()->getParent()->hasOptSize() ||
      TheLoop->getHeader()->getParent()->hasMinSize()) {
    LLVM_DEBUG(
        dbgs()
            << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
    return Result;
  }

  if (!isEpilogueVectorizationProfitable(MainLoopVF))
    return Result;

  // Pick the cheapest profitable VF that is strictly smaller than the main
  // loop's VF and for which a VPlan covering both VFs exists.
  for (auto &NextVF : ProfitableVFs)
    if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
        (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
        LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
      Result = NextVF;

  if (Result != VectorizationFactor::Disabled())
    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
                      << Result.Width.getFixedValue() << "\n";);
  return Result;
}
5952 
5953 std::pair<unsigned, unsigned>
5954 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5955   unsigned MinWidth = -1U;
5956   unsigned MaxWidth = 8;
5957   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5958 
5959   // For each block.
5960   for (BasicBlock *BB : TheLoop->blocks()) {
5961     // For each instruction in the loop.
5962     for (Instruction &I : BB->instructionsWithoutDebug()) {
5963       Type *T = I.getType();
5964 
5965       // Skip ignored values.
5966       if (ValuesToIgnore.count(&I))
5967         continue;
5968 
5969       // Only examine Loads, Stores and PHINodes.
5970       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5971         continue;
5972 
5973       // Examine PHI nodes that are reduction variables. Update the type to
5974       // account for the recurrence type.
5975       if (auto *PN = dyn_cast<PHINode>(&I)) {
5976         if (!Legal->isReductionVariable(PN))
5977           continue;
5978         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5979         T = RdxDesc.getRecurrenceType();
5980       }
5981 
5982       // Examine the stored values.
5983       if (auto *ST = dyn_cast<StoreInst>(&I))
5984         T = ST->getValueOperand()->getType();
5985 
5986       // Ignore loaded pointer types and stored pointer types that are not
5987       // vectorizable.
5988       //
5989       // FIXME: The check here attempts to predict whether a load or store will
5990       //        be vectorized. We only know this for certain after a VF has
5991       //        been selected. Here, we assume that if an access can be
5992       //        vectorized, it will be. We should also look at extending this
5993       //        optimization to non-pointer types.
5994       //
5995       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5996           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5997         continue;
5998 
5999       MinWidth = std::min(MinWidth,
6000                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6001       MaxWidth = std::max(MaxWidth,
6002                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6003     }
6004   }
6005 
6006   return {MinWidth, MaxWidth};
6007 }
6008 
6009 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
6010                                                            unsigned LoopCost) {
6011   // -- The interleave heuristics --
6012   // We interleave the loop in order to expose ILP and reduce the loop overhead.
6013   // There are many micro-architectural considerations that we can't predict
6014   // at this level. For example, frontend pressure (on decode or fetch) due to
6015   // code size, or the number and capabilities of the execution ports.
6016   //
6017   // We use the following heuristics to select the interleave count:
6018   // 1. If the code has reductions, then we interleave to break the cross
6019   // iteration dependency.
6020   // 2. If the loop is really small, then we interleave to reduce the loop
6021   // overhead.
6022   // 3. We don't interleave if we think that we will spill registers to memory
6023   // due to the increased register pressure.
6024 
6025   if (!isScalarEpilogueAllowed())
6026     return 1;
6027 
6028   // We used the distance for the interleave count.
6029   if (Legal->getMaxSafeDepDistBytes() != -1U)
6030     return 1;
6031 
6032   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6033   const bool HasReductions = !Legal->getReductionVars().empty();
6034   // Do not interleave loops with a relatively small known or estimated trip
6035   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
6036   // enabled, and the code has scalar reductions(HasReductions && VF = 1),
6037   // because with the above conditions interleaving can expose ILP and break
6038   // cross iteration dependences for reductions.
6039   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6040       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6041     return 1;
6042 
6043   RegisterUsage R = calculateRegisterUsage({VF})[0];
6044   // We divide by these constants so assume that we have at least one
6045   // instruction that uses at least one register.
6046   for (auto& pair : R.MaxLocalUsers) {
6047     pair.second = std::max(pair.second, 1U);
6048   }
6049 
6050   // We calculate the interleave count using the following formula.
6051   // Subtract the number of loop invariants from the number of available
6052   // registers. These registers are used by all of the interleaved instances.
6053   // Next, divide the remaining registers by the number of registers that is
6054   // required by the loop, in order to estimate how many parallel instances
6055   // fit without causing spills. All of this is rounded down if necessary to be
6056   // a power of two. We want power of two interleave count to simplify any
6057   // addressing operations or alignment considerations.
6058   // We also want power of two interleave counts to ensure that the induction
6059   // variable of the vector loop wraps to zero, when tail is folded by masking;
6060   // this currently happens when OptForSize, in which case IC is set to 1 above.
6061   unsigned IC = UINT_MAX;
6062 
6063   for (auto& pair : R.MaxLocalUsers) {
6064     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
6065     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6066                       << " registers of "
6067                       << TTI.getRegisterClassName(pair.first) << " register class\n");
6068     if (VF.isScalar()) {
6069       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6070         TargetNumRegisters = ForceTargetNumScalarRegs;
6071     } else {
6072       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6073         TargetNumRegisters = ForceTargetNumVectorRegs;
6074     }
6075     unsigned MaxLocalUsers = pair.second;
6076     unsigned LoopInvariantRegs = 0;
6077     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6078       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6079 
6080     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
6081     // Don't count the induction variable as interleaved.
6082     if (EnableIndVarRegisterHeur) {
6083       TmpIC =
6084           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6085                         std::max(1U, (MaxLocalUsers - 1)));
6086     }
6087 
6088     IC = std::min(IC, TmpIC);
6089   }
6090 
6091   // Clamp the interleave ranges to reasonable counts.
6092   unsigned MaxInterleaveCount =
6093       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6094 
6095   // Check if the user has overridden the max.
6096   if (VF.isScalar()) {
6097     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6098       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6099   } else {
6100     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6101       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6102   }
6103 
6104   // If trip count is known or estimated compile time constant, limit the
6105   // interleave count to be less than the trip count divided by VF, provided it
6106   // is at least 1.
6107   //
6108   // For scalable vectors we can't know if interleaving is beneficial. It may
6109   // not be beneficial for small loops if none of the lanes in the second vector
6110   // iterations is enabled. However, for larger loops, there is likely to be a
6111   // similar benefit as for fixed-width vectors. For now, we choose to leave
6112   // the InterleaveCount as if vscale is '1', although if some information about
6113   // the vector is known (e.g. min vector size), we can make a better decision.
6114   if (BestKnownTC) {
6115     MaxInterleaveCount =
6116         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6117     // Make sure MaxInterleaveCount is greater than 0.
6118     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6119   }
6120 
6121   assert(MaxInterleaveCount > 0 &&
6122          "Maximum interleave count must be greater than 0");
6123 
6124   // Clamp the calculated IC to be between the 1 and the max interleave count
6125   // that the target and trip count allows.
6126   if (IC > MaxInterleaveCount)
6127     IC = MaxInterleaveCount;
6128   else
6129     // Make sure IC is greater than 0.
6130     IC = std::max(1u, IC);
6131 
6132   assert(IC > 0 && "Interleave count must be greater than 0.");
6133 
6134   // If we did not calculate the cost for VF (because the user selected the VF)
6135   // then we calculate the cost of VF here.
6136   if (LoopCost == 0) {
6137     assert(expectedCost(VF).first.isValid() && "Expected a valid cost");
6138     LoopCost = *expectedCost(VF).first.getValue();
6139   }
6140 
6141   assert(LoopCost && "Non-zero loop cost expected");
6142 
6143   // Interleave if we vectorized this loop and there is a reduction that could
6144   // benefit from interleaving.
6145   if (VF.isVector() && HasReductions) {
6146     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6147     return IC;
6148   }
6149 
6150   // Note that if we've already vectorized the loop we will have done the
6151   // runtime check and so interleaving won't require further checks.
6152   bool InterleavingRequiresRuntimePointerCheck =
6153       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6154 
6155   // We want to interleave small loops in order to reduce the loop overhead and
6156   // potentially expose ILP opportunities.
6157   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6158                     << "LV: IC is " << IC << '\n'
6159                     << "LV: VF is " << VF << '\n');
6160   const bool AggressivelyInterleaveReductions =
6161       TTI.enableAggressiveInterleaving(HasReductions);
6162   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
6163     // We assume that the cost overhead is 1 and we use the cost model
6164     // to estimate the cost of the loop and interleave until the cost of the
6165     // loop overhead is about 5% of the cost of the loop.
6166     unsigned SmallIC =
6167         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6168 
6169     // Interleave until store/load ports (estimated by max interleave count) are
6170     // saturated.
6171     unsigned NumStores = Legal->getNumStores();
6172     unsigned NumLoads = Legal->getNumLoads();
6173     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6174     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6175 
6176     // If we have a scalar reduction (vector reductions are already dealt with
6177     // by this point), we can increase the critical path length if the loop
6178     // we're interleaving is inside another loop. Limit, by default to 2, so the
6179     // critical path only gets increased by one reduction operation.
6180     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6181       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6182       SmallIC = std::min(SmallIC, F);
6183       StoresIC = std::min(StoresIC, F);
6184       LoadsIC = std::min(LoadsIC, F);
6185     }
6186 
6187     if (EnableLoadStoreRuntimeInterleave &&
6188         std::max(StoresIC, LoadsIC) > SmallIC) {
6189       LLVM_DEBUG(
6190           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6191       return std::max(StoresIC, LoadsIC);
6192     }
6193 
6194     // If there are scalar reductions and TTI has enabled aggressive
6195     // interleaving for reductions, we will interleave to expose ILP.
6196     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6197         AggressivelyInterleaveReductions) {
6198       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6199       // Interleave no less than SmallIC but not as aggressive as the normal IC
6200       // to satisfy the rare situation when resources are too limited.
6201       return std::max(IC / 2, SmallIC);
6202     } else {
6203       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6204       return SmallIC;
6205     }
6206   }
6207 
6208   // Interleave if this is a large loop (small loops are already dealt with by
6209   // this point) that could benefit from interleaving.
6210   if (AggressivelyInterleaveReductions) {
6211     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6212     return IC;
6213   }
6214 
6215   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6216   return 1;
6217 }
6218 
6219 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6220 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6221   // This function calculates the register usage by measuring the highest number
6222   // of values that are alive at a single location. Obviously, this is a very
6223   // rough estimation. We scan the loop in a topological order in order and
6224   // assign a number to each instruction. We use RPO to ensure that defs are
6225   // met before their users. We assume that each instruction that has in-loop
6226   // users starts an interval. We record every time that an in-loop value is
6227   // used, so we have a list of the first and last occurrences of each
6228   // instruction. Next, we transpose this data structure into a multi map that
6229   // holds the list of intervals that *end* at a specific location. This multi
6230   // map allows us to perform a linear search. We scan the instructions linearly
6231   // and record each time that a new interval starts, by placing it in a set.
6232   // If we find this value in the multi-map then we remove it from the set.
6233   // The max register usage is the maximum size of the set.
6234   // We also search for instructions that are defined outside the loop, but are
6235   // used inside the loop. We need this number separately from the max-interval
6236   // usage number because when we unroll, loop-invariant values do not take
6237   // more register.
6238   LoopBlocksDFS DFS(TheLoop);
6239   DFS.perform(LI);
6240 
6241   RegisterUsage RU;
6242 
6243   // Each 'key' in the map opens a new interval. The values
6244   // of the map are the index of the 'last seen' usage of the
6245   // instruction that is the key.
6246   using IntervalMap = DenseMap<Instruction *, unsigned>;
6247 
6248   // Maps instruction to its index.
6249   SmallVector<Instruction *, 64> IdxToInstr;
6250   // Marks the end of each interval.
6251   IntervalMap EndPoint;
6252   // Saves the list of instruction indices that are used in the loop.
6253   SmallPtrSet<Instruction *, 8> Ends;
6254   // Saves the list of values that are used in the loop but are
6255   // defined outside the loop, such as arguments and constants.
6256   SmallPtrSet<Value *, 8> LoopInvariants;
6257 
6258   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6259     for (Instruction &I : BB->instructionsWithoutDebug()) {
6260       IdxToInstr.push_back(&I);
6261 
6262       // Save the end location of each USE.
6263       for (Value *U : I.operands()) {
6264         auto *Instr = dyn_cast<Instruction>(U);
6265 
6266         // Ignore non-instruction values such as arguments, constants, etc.
6267         if (!Instr)
6268           continue;
6269 
6270         // If this instruction is outside the loop then record it and continue.
6271         if (!TheLoop->contains(Instr)) {
6272           LoopInvariants.insert(Instr);
6273           continue;
6274         }
6275 
6276         // Overwrite previous end points.
6277         EndPoint[Instr] = IdxToInstr.size();
6278         Ends.insert(Instr);
6279       }
6280     }
6281   }
6282 
6283   // Saves the list of intervals that end with the index in 'key'.
6284   using InstrList = SmallVector<Instruction *, 2>;
6285   DenseMap<unsigned, InstrList> TransposeEnds;
6286 
6287   // Transpose the EndPoints to a list of values that end at each index.
6288   for (auto &Interval : EndPoint)
6289     TransposeEnds[Interval.second].push_back(Interval.first);
6290 
6291   SmallPtrSet<Instruction *, 8> OpenIntervals;
6292   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6293   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6294 
6295   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6296 
6297   // A lambda that gets the register usage for the given type and VF.
6298   const auto &TTICapture = TTI;
6299   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
6300     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6301       return 0U;
6302     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6303   };
6304 
6305   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6306     Instruction *I = IdxToInstr[i];
6307 
6308     // Remove all of the instructions that end at this location.
6309     InstrList &List = TransposeEnds[i];
6310     for (Instruction *ToRemove : List)
6311       OpenIntervals.erase(ToRemove);
6312 
6313     // Ignore instructions that are never used within the loop.
6314     if (!Ends.count(I))
6315       continue;
6316 
6317     // Skip ignored values.
6318     if (ValuesToIgnore.count(I))
6319       continue;
6320 
6321     // For each VF find the maximum usage of registers.
6322     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6323       // Count the number of live intervals.
6324       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6325 
6326       if (VFs[j].isScalar()) {
6327         for (auto Inst : OpenIntervals) {
6328           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6329           if (RegUsage.find(ClassID) == RegUsage.end())
6330             RegUsage[ClassID] = 1;
6331           else
6332             RegUsage[ClassID] += 1;
6333         }
6334       } else {
6335         collectUniformsAndScalars(VFs[j]);
6336         for (auto Inst : OpenIntervals) {
6337           // Skip ignored values for VF > 1.
6338           if (VecValuesToIgnore.count(Inst))
6339             continue;
6340           if (isScalarAfterVectorization(Inst, VFs[j])) {
6341             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6342             if (RegUsage.find(ClassID) == RegUsage.end())
6343               RegUsage[ClassID] = 1;
6344             else
6345               RegUsage[ClassID] += 1;
6346           } else {
6347             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
6348             if (RegUsage.find(ClassID) == RegUsage.end())
6349               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6350             else
6351               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6352           }
6353         }
6354       }
6355 
6356       for (auto& pair : RegUsage) {
6357         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
6358           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
6359         else
6360           MaxUsages[j][pair.first] = pair.second;
6361       }
6362     }
6363 
6364     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6365                       << OpenIntervals.size() << '\n');
6366 
6367     // Add the current instruction to the list of open intervals.
6368     OpenIntervals.insert(I);
6369   }
6370 
6371   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6372     SmallMapVector<unsigned, unsigned, 4> Invariant;
6373 
6374     for (auto Inst : LoopInvariants) {
6375       unsigned Usage =
6376           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6377       unsigned ClassID =
6378           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6379       if (Invariant.find(ClassID) == Invariant.end())
6380         Invariant[ClassID] = Usage;
6381       else
6382         Invariant[ClassID] += Usage;
6383     }
6384 
6385     LLVM_DEBUG({
6386       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6387       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6388              << " item\n";
6389       for (const auto &pair : MaxUsages[i]) {
6390         dbgs() << "LV(REG): RegisterClass: "
6391                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6392                << " registers\n";
6393       }
6394       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6395              << " item\n";
6396       for (const auto &pair : Invariant) {
6397         dbgs() << "LV(REG): RegisterClass: "
6398                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6399                << " registers\n";
6400       }
6401     });
6402 
6403     RU.LoopInvariantRegs = Invariant;
6404     RU.MaxLocalUsers = MaxUsages[i];
6405     RUs[i] = RU;
6406   }
6407 
6408   return RUs;
6409 }
6410 
6411 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
6412   // TODO: Cost model for emulated masked load/store is completely
6413   // broken. This hack guides the cost model to use an artificially
6414   // high enough value to practically disable vectorization with such
6415   // operations, except where previously deployed legality hack allowed
6416   // using very low cost values. This is to avoid regressions coming simply
6417   // from moving "masked load/store" check from legality to cost model.
6418   // Masked Load/Gather emulation was previously never allowed.
6419   // Limited number of Masked Store/Scatter emulation was allowed.
6420   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
6421   return isa<LoadInst>(I) ||
6422          (isa<StoreInst>(I) &&
6423           NumPredStores > NumberOfStoresToPredicate);
6424 }
6425 
6426 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6427   // If we aren't vectorizing the loop, or if we've already collected the
6428   // instructions to scalarize, there's nothing to do. Collection may already
6429   // have occurred if we have a user-selected VF and are now computing the
6430   // expected cost for interleaving.
6431   if (VF.isScalar() || VF.isZero() ||
6432       InstsToScalarize.find(VF) != InstsToScalarize.end())
6433     return;
6434 
6435   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
6436   // not profitable to scalarize any instructions, the presence of VF in the
6437   // map will indicate that we've analyzed it already.
6438   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6439 
6440   // Find all the instructions that are scalar with predication in the loop and
6441   // determine if it would be better to not if-convert the blocks they are in.
6442   // If so, we also record the instructions to scalarize.
6443   for (BasicBlock *BB : TheLoop->blocks()) {
6444     if (!blockNeedsPredication(BB))
6445       continue;
6446     for (Instruction &I : *BB)
6447       if (isScalarWithPredication(&I)) {
6448         ScalarCostsTy ScalarCosts;
6449         // Do not apply discount logic if hacked cost is needed
6450         // for emulated masked memrefs.
6451         if (!useEmulatedMaskMemRefHack(&I) &&
6452             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6453           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6454         // Remember that BB will remain after vectorization.
6455         PredicatedBBsAfterVectorization.insert(BB);
6456       }
6457   }
6458 }
6459 
// Computes the cost discount of scalarizing the single-use expression chain
// feeding the predicated instruction \p PredInst at vectorization factor
// \p VF. A non-negative result means scalarizing is at least as cheap as
// vectorizing. The scalar cost of each chain member is recorded in
// \p ScalarCosts.
int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.find(I) != ScalarCosts.end())
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    assert(!VF.isScalable() && "scalable vectors not yet supported.");
    InstructionCost ScalarCost =
        VF.getKnownMinValue() *
        getInstructionCost(I, ElementCount::getFixed(1)).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
      ScalarCost += TTI.getScalarizationOverhead(
          cast<VectorType>(ToVectorTy(I->getType(), VF)),
          APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      // One phi per scalarized lane to merge the predicated results.
      ScalarCost +=
          VF.getKnownMinValue() *
          TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF)) {
          assert(!VF.isScalable() && "scalable vectors not yet supported.");
          ScalarCost += TTI.getScalarizationOverhead(
              cast<VectorType>(ToVectorTy(J->getType(), VF)),
              APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  // The accumulated discount is expected to be a valid cost here; extract its
  // concrete integer value for the caller's >= 0 profitability check.
  return *Discount.getValue();
}
6574 
6575 LoopVectorizationCostModel::VectorizationCostTy
6576 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6577   VectorizationCostTy Cost;
6578 
6579   // For each block.
6580   for (BasicBlock *BB : TheLoop->blocks()) {
6581     VectorizationCostTy BlockCost;
6582 
6583     // For each instruction in the old loop.
6584     for (Instruction &I : BB->instructionsWithoutDebug()) {
6585       // Skip ignored values.
6586       if (ValuesToIgnore.count(&I) ||
6587           (VF.isVector() && VecValuesToIgnore.count(&I)))
6588         continue;
6589 
6590       VectorizationCostTy C = getInstructionCost(&I, VF);
6591 
6592       // Check if we should override the cost.
6593       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6594         C.first = InstructionCost(ForceTargetInstructionCost);
6595 
6596       BlockCost.first += C.first;
6597       BlockCost.second |= C.second;
6598       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6599                         << " for VF " << VF << " For instruction: " << I
6600                         << '\n');
6601     }
6602 
6603     // If we are vectorizing a predicated block, it will have been
6604     // if-converted. This means that the block's instructions (aside from
6605     // stores and instructions that may divide by zero) will now be
6606     // unconditionally executed. For the scalar case, we may not always execute
6607     // the predicated block, if it is an if-else block. Thus, scale the block's
6608     // cost by the probability of executing it. blockNeedsPredication from
6609     // Legal is used so as to not include all blocks in tail folded loops.
6610     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6611       BlockCost.first /= getReciprocalPredBlockProb();
6612 
6613     Cost.first += BlockCost.first;
6614     Cost.second |= BlockCost.second;
6615   }
6616 
6617   return Cost;
6618 }
6619 
6620 /// Gets Address Access SCEV after verifying that the access pattern
6621 /// is loop invariant except the induction variable dependence.
6622 ///
6623 /// This SCEV can be sent to the Target in order to estimate the address
6624 /// calculation cost.
6625 static const SCEV *getAddressAccessSCEV(
6626               Value *Ptr,
6627               LoopVectorizationLegality *Legal,
6628               PredicatedScalarEvolution &PSE,
6629               const Loop *TheLoop) {
6630 
6631   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6632   if (!Gep)
6633     return nullptr;
6634 
6635   // We are looking for a gep with all loop invariant indices except for one
6636   // which should be an induction variable.
6637   auto SE = PSE.getSE();
6638   unsigned NumOperands = Gep->getNumOperands();
6639   for (unsigned i = 1; i < NumOperands; ++i) {
6640     Value *Opd = Gep->getOperand(i);
6641     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6642         !Legal->isInductionVariable(Opd))
6643       return nullptr;
6644   }
6645 
6646   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
6647   return PSE.getSCEV(Ptr);
6648 }
6649 
6650 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6651   return Legal->hasStride(I->getOperand(0)) ||
6652          Legal->hasStride(I->getOperand(1));
6653 }
6654 
6655 InstructionCost
6656 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6657                                                         ElementCount VF) {
6658   assert(VF.isVector() &&
6659          "Scalarization cost of instruction implies vectorization.");
6660   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6661   Type *ValTy = getMemInstValueType(I);
6662   auto SE = PSE.getSE();
6663 
6664   unsigned AS = getLoadStoreAddressSpace(I);
6665   Value *Ptr = getLoadStorePointerOperand(I);
6666   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6667 
6668   // Figure out whether the access is strided and get the stride value
6669   // if it's known in compile time
6670   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6671 
6672   // Get the cost of the scalar memory instruction and address computation.
6673   InstructionCost Cost =
6674       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6675 
6676   // Don't pass *I here, since it is scalar but will actually be part of a
6677   // vectorized loop where the user of it is a vectorized instruction.
6678   const Align Alignment = getLoadStoreAlignment(I);
6679   Cost += VF.getKnownMinValue() *
6680           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6681                               AS, TTI::TCK_RecipThroughput);
6682 
6683   // Get the overhead of the extractelement and insertelement instructions
6684   // we might create due to scalarization.
6685   Cost += getScalarizationOverhead(I, VF);
6686 
6687   // If we have a predicated store, it may not be executed for each vector
6688   // lane. Scale the cost by the probability of executing the predicated
6689   // block.
6690   if (isPredicatedInst(I)) {
6691     Cost /= getReciprocalPredBlockProb();
6692 
6693     if (useEmulatedMaskMemRefHack(I))
6694       // Artificially setting to a high enough value to practically disable
6695       // vectorization with such operations.
6696       Cost = 3000000;
6697   }
6698 
6699   return Cost;
6700 }
6701 
6702 InstructionCost
6703 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6704                                                     ElementCount VF) {
6705   Type *ValTy = getMemInstValueType(I);
6706   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6707   Value *Ptr = getLoadStorePointerOperand(I);
6708   unsigned AS = getLoadStoreAddressSpace(I);
6709   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6710   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6711 
6712   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6713          "Stride should be 1 or -1 for consecutive memory access");
6714   const Align Alignment = getLoadStoreAlignment(I);
6715   InstructionCost Cost = 0;
6716   if (Legal->isMaskRequired(I))
6717     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6718                                       CostKind);
6719   else
6720     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6721                                 CostKind, I);
6722 
6723   bool Reverse = ConsecutiveStride < 0;
6724   if (Reverse)
6725     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6726   return Cost;
6727 }
6728 
6729 InstructionCost
6730 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6731                                                 ElementCount VF) {
6732   assert(Legal->isUniformMemOp(*I));
6733 
6734   Type *ValTy = getMemInstValueType(I);
6735   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6736   const Align Alignment = getLoadStoreAlignment(I);
6737   unsigned AS = getLoadStoreAddressSpace(I);
6738   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6739   if (isa<LoadInst>(I)) {
6740     return TTI.getAddressComputationCost(ValTy) +
6741            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6742                                CostKind) +
6743            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6744   }
6745   StoreInst *SI = cast<StoreInst>(I);
6746 
6747   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6748   return TTI.getAddressComputationCost(ValTy) +
6749          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6750                              CostKind) +
6751          (isLoopInvariantStoreValue
6752               ? 0
6753               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6754                                        VF.getKnownMinValue() - 1));
6755 }
6756 
6757 InstructionCost
6758 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6759                                                  ElementCount VF) {
6760   Type *ValTy = getMemInstValueType(I);
6761   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6762   const Align Alignment = getLoadStoreAlignment(I);
6763   const Value *Ptr = getLoadStorePointerOperand(I);
6764 
6765   return TTI.getAddressComputationCost(VectorTy) +
6766          TTI.getGatherScatterOpCost(
6767              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6768              TargetTransformInfo::TCK_RecipThroughput, I);
6769 }
6770 
6771 InstructionCost
6772 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6773                                                    ElementCount VF) {
6774   Type *ValTy = getMemInstValueType(I);
6775   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6776   unsigned AS = getLoadStoreAddressSpace(I);
6777 
6778   auto Group = getInterleavedAccessGroup(I);
6779   assert(Group && "Fail to get an interleaved access group.");
6780 
6781   unsigned InterleaveFactor = Group->getFactor();
6782   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6783   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6784 
6785   // Holds the indices of existing members in an interleaved load group.
6786   // An interleaved store group doesn't need this as it doesn't allow gaps.
6787   SmallVector<unsigned, 4> Indices;
6788   if (isa<LoadInst>(I)) {
6789     for (unsigned i = 0; i < InterleaveFactor; i++)
6790       if (Group->getMember(i))
6791         Indices.push_back(i);
6792   }
6793 
6794   // Calculate the cost of the whole interleaved group.
6795   bool UseMaskForGaps =
6796       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6797   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6798       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6799       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6800 
6801   if (Group->isReverse()) {
6802     // TODO: Add support for reversed masked interleaved access.
6803     assert(!Legal->isMaskRequired(I) &&
6804            "Reverse masked interleaved access not supported.");
6805     Cost += Group->getNumMembers() *
6806             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6807   }
6808   return Cost;
6809 }
6810 
6811 InstructionCost
6812 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6813                                                      ElementCount VF) {
6814   // Calculate scalar cost only. Vectorization cost should be ready at this
6815   // moment.
6816   if (VF.isScalar()) {
6817     Type *ValTy = getMemInstValueType(I);
6818     const Align Alignment = getLoadStoreAlignment(I);
6819     unsigned AS = getLoadStoreAddressSpace(I);
6820 
6821     return TTI.getAddressComputationCost(ValTy) +
6822            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6823                                TTI::TCK_RecipThroughput, I);
6824   }
6825   return getWideningCost(I, VF);
6826 }
6827 
6828 LoopVectorizationCostModel::VectorizationCostTy
6829 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6830                                                ElementCount VF) {
6831   // If we know that this instruction will remain uniform, check the cost of
6832   // the scalar version.
6833   if (isUniformAfterVectorization(I, VF))
6834     VF = ElementCount::getFixed(1);
6835 
6836   if (VF.isVector() && isProfitableToScalarize(I, VF))
6837     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6838 
6839   // Forced scalars do not have any scalarization overhead.
6840   auto ForcedScalar = ForcedScalars.find(VF);
6841   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6842     auto InstSet = ForcedScalar->second;
6843     if (InstSet.count(I))
6844       return VectorizationCostTy(
6845           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6846            VF.getKnownMinValue()),
6847           false);
6848   }
6849 
6850   Type *VectorTy;
6851   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6852 
6853   bool TypeNotScalarized =
6854       VF.isVector() && VectorTy->isVectorTy() &&
6855       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6856   return VectorizationCostTy(C, TypeNotScalarized);
6857 }
6858 
InstructionCost
LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
                                                     ElementCount VF) {
  // Estimates the insert/extract overhead of executing I as VF scalar copies:
  // the inserts needed to build the result vector plus the extracts needed to
  // obtain the scalar operands from their vector definitions.
  assert(!VF.isScalable() &&
         "cannot compute scalarization overhead for scalable vectorization");
  if (VF.isScalar())
    return 0;

  InstructionCost Cost = 0;
  Type *RetTy = ToVectorTy(I->getType(), VF);
  // Cost of inserting the scalar results into a vector. Skipped for loads on
  // targets that support efficient per-element vector load/store.
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
    Cost += TTI.getScalarizationOverhead(
        cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
        true, false);

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider. For calls, only the arguments (not the
  // callee operand) are candidates for extraction.
  CallInst *CI = dyn_cast<CallInst>(I);
  Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not incur
  // any overhead.
  return Cost + TTI.getOperandsScalarizationOverhead(
                    filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
}
6893 
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  // For each memory instruction in the loop, choose the cheapest widening
  // strategy for this VF (widen, widen-reverse, interleave, gather/scatter or
  // scalarize) and record the decision for later use during vectorization.
  // Scalar VFs need no decision.
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      // Only loads and stores (instructions with a pointer operand) are
      // considered here.
      Value *Ptr =  getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;

      if (Legal->isUniformMemOp(I)) {
        // TODO: Avoid replicating loads and stores instead of
        // relying on instcombine to remove them.
        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        InstructionCost Cost = getUniformMemOpCost(&I, VF);
        setWideningDecision(&I, VF, CM_Scalarize, Cost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride =
               Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = std::numeric_limits<int>::max();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        // Interleave cost below covers all members, so scale the competing
        // per-instruction costs by the member count.
        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : std::numeric_limits<int>::max();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instructions belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
        dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  for (auto *I : AddrDefs)
    Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    // Only operands defined in the same block (and not phis) are pulled into
    // the address-computation set.
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        // NOTE: the loop index below shadows the outer Instruction *I.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}
7045 
InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  // Computes the expected cost of I when vectorized with factor VF. VectorTy
  // is set to the type the cost was computed for: the (possibly min-bitwidth
  // truncated) scalar type when I stays scalar after vectorization, otherwise
  // the corresponding vector type.
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Return cost for branches around scalarized and predicated blocks.
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      auto *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (TTI.getScalarizationOverhead(
                  Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
                  false, true) +
              (TTI.getCFInstrCost(Instruction::Br, CostKind) *
               VF.getKnownMinValue()));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br, CostKind);
    else
      // This branch will be eliminated by if-conversion.
      return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
    if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
      return TTI.getShuffleCost(
          TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
          VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
                 CmpInst::BAD_ICMP_PREDICATE, CostKind);

    return TTI.getCFInstrCost(Instruction::PHI, CostKind);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // If we have a predicated instruction, it may not be executed for each
    // vector lane. Get the scalarization cost and scale this amount by the
    // probability of executing the predicated block. If the instruction is not
    // predicated, we fall through to the next case.
    if (VF.isVector() && isScalarWithPredication(I)) {
      InstructionCost Cost = 0;

      // These instructions have a non-void type, so account for the phi nodes
      // that we will create. This cost is likely to be zero. The phi node
      // cost, if any, should be scaled by the block probability because it
      // models a copy at the end of each predicated block.
      Cost += VF.getKnownMinValue() *
              TTI.getCFInstrCost(Instruction::PHI, CostKind);

      // The cost of the non-predicated instruction.
      Cost += VF.getKnownMinValue() *
              TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);

      // The cost of insertelement and extractelement instructions needed for
      // scalarization.
      Cost += getScalarizationOverhead(I, VF);

      // Scale the cost by the probability of executing the predicated blocks.
      // This assumes the predicated block for each vector lane is equally
      // likely.
      return Cost / getReciprocalPredBlockProb();
    }
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Since we will replace the stride by 1 the multiplication should go away.
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(1);
    TargetTransformInfo::OperandValueProperties Op2VP;
    TargetTransformInfo::OperandValueKind Op2VK =
        TTI.getOperandInfo(Op2, Op2VP);
    if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
      Op2VK = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    // N scalar copies when the instruction stays scalar, one vector op
    // otherwise.
    unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, CostKind,
                   TargetTransformInfo::OK_AnyValue,
                   Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
  }
  case Instruction::FNeg: {
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
    return N * TTI.getArithmeticInstrCost(
                   I->getOpcode(), VectorTy, CostKind,
                   TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OK_AnyValue,
                   TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
                   I->getOperand(0), I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    // A loop-invariant condition stays scalar; otherwise it is widened too.
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond) {
      assert(!VF.isScalable() && "VF is assumed to be non scalable.");
      CondTy = VectorType::get(CondTy, VF);
    }
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
                                  CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
    // Compare at the operand's minimal bitwidth if it can be truncated.
    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
                                  CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    // The widening decision was made earlier; report the type it applies to
    // (scalar type when the decision is to scalarize) and reuse its cost.
    ElementCount Width = VF;
    if (Width.isVector()) {
      InstWidening Decision = getWideningDecision(I, Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (Decision == CM_Scalarize)
        Width = ElementCount::getFixed(1);
    }
    VectorTy = ToVectorTy(getMemInstValueType(I), Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // Computes the CastContextHint from a Load/Store instruction.
    auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             "Expected a load or a store!");

      if (VF.isScalar() || !TheLoop->contains(I))
        return TTI::CastContextHint::Normal;

      switch (getWideningDecision(I, VF)) {
      case LoopVectorizationCostModel::CM_GatherScatter:
        return TTI::CastContextHint::GatherScatter;
      case LoopVectorizationCostModel::CM_Interleave:
        return TTI::CastContextHint::Interleave;
      case LoopVectorizationCostModel::CM_Scalarize:
      case LoopVectorizationCostModel::CM_Widen:
        return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
                                        : TTI::CastContextHint::Normal;
      case LoopVectorizationCostModel::CM_Widen_Reverse:
        return TTI::CastContextHint::Reversed;
      case LoopVectorizationCostModel::CM_Unknown:
        llvm_unreachable("Instr did not go through cost modelling?");
      }

      llvm_unreachable("Unhandled case!");
    };

    unsigned Opcode = I->getOpcode();
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc, the context is the only user, which must be a StoreInst.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      if (I->hasOneUse())
        if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
          CCH = ComputeCCH(Store);
    }
    // For Z/Sext, the context is the operand, which must be a LoadInst.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
        CCH = ComputeCCH(Load);
    }

    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(I);
      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
                                  Trunc->getSrcTy(), CCH, CostKind, Trunc);
    }

    Type *SrcScalarTy = I->getOperand(0)->getType();
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
    if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
      //
      // Calculate the modified src and dest types.
      Type *MinVecTy = VectorTy;
      if (Opcode == Instruction::Trunc) {
        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
        SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      }
    }

    assert(!VF.isScalable() && "VF is assumed to be non scalable");
    unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
    return N *
           TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
  }
  case Instruction::Call: {
    // Use the cheaper of the call's vector-library cost and (when it maps to
    // an intrinsic) the intrinsic cost.
    bool NeedToScalarize;
    CallInst *CI = cast<CallInst>(I);
    InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
    if (getVectorIntrinsicIDForCall(CI, TLI)) {
      InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
      return std::min(CallCost, IntrinsicCost);
    }
    return CallCost;
  }
  case Instruction::ExtractValue:
    // Delegate to the generic TTI cost for extractvalue.
    return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
  default:
    // The cost of executing VF copies of the scalar instruction. This opcode
    // is unknown. Assume that it is the same as 'mul'.
    return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
                                       Instruction::Mul, VectorTy, CostKind) +
           getScalarizationOverhead(I, VF);
  } // end of switch.
}
7335 
// Legacy pass-manager boilerplate: pass identity and registration, listing
// the analyses the pass depends on.
char LoopVectorize::ID = 0;

static const char lv_name[] = "Loop Vectorization";

INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7356 
namespace llvm {

/// Create a loop-vectorizer legacy pass with default-constructed options.
Pass *createLoopVectorizePass() { return new LoopVectorize(); }

/// Create a loop-vectorizer legacy pass, forwarding the two forcing flags to
/// the LoopVectorize constructor.
Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
                              bool VectorizeOnlyWhenForced) {
  return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
}

} // end namespace llvm
7367 
7368 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7369   // Check if the pointer operand of a load or store instruction is
7370   // consecutive.
7371   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7372     return Legal->isConsecutivePtr(Ptr);
7373   return false;
7374 }
7375 
7376 void LoopVectorizationCostModel::collectValuesToIgnore() {
7377   // Ignore ephemeral values.
7378   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7379 
7380   // Ignore type-promoting instructions we identified during reduction
7381   // detection.
7382   for (auto &Reduction : Legal->getReductionVars()) {
7383     RecurrenceDescriptor &RedDes = Reduction.second;
7384     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7385     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7386   }
7387   // Ignore type-casting instructions we identified during induction
7388   // detection.
7389   for (auto &Induction : Legal->getInductionVars()) {
7390     InductionDescriptor &IndDes = Induction.second;
7391     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7392     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7393   }
7394 }
7395 
7396 void LoopVectorizationCostModel::collectInLoopReductions() {
7397   for (auto &Reduction : Legal->getReductionVars()) {
7398     PHINode *Phi = Reduction.first;
7399     RecurrenceDescriptor &RdxDesc = Reduction.second;
7400 
7401     // We don't collect reductions that are type promoted (yet).
7402     if (RdxDesc.getRecurrenceType() != Phi->getType())
7403       continue;
7404 
7405     // If the target would prefer this reduction to happen "in-loop", then we
7406     // want to record it as such.
7407     unsigned Opcode = RdxDesc.getOpcode();
7408     if (!PreferInLoopReductions &&
7409         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7410                                    TargetTransformInfo::ReductionFlags()))
7411       continue;
7412 
7413     // Check that we can correctly put the reductions into the loop, by
7414     // finding the chain of operations that leads from the phi to the loop
7415     // exit value.
7416     SmallVector<Instruction *, 4> ReductionOperations =
7417         RdxDesc.getReductionOpChain(Phi, TheLoop);
7418     bool InLoop = !ReductionOperations.empty();
7419     if (InLoop)
7420       InLoopReductionChains[Phi] = ReductionOperations;
7421     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7422                       << " reduction for phi: " << *Phi << "\n");
7423   }
7424 }
7425 
7426 // TODO: we could return a pair of values that specify the max VF and
7427 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7428 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7429 // doesn't have a cost model that can choose which plan to execute if
7430 // more than one is generated.
7431 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7432                                  LoopVectorizationCostModel &CM) {
7433   unsigned WidestType;
7434   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7435   return WidestVectorRegBits / WidestType;
7436 }
7437 
7438 VectorizationFactor
7439 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7440   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7441   ElementCount VF = UserVF;
7442   // Outer loop handling: They may require CFG and instruction level
7443   // transformations before even evaluating whether vectorization is profitable.
7444   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7445   // the vectorization pipeline.
7446   if (!OrigLoop->isInnermost()) {
7447     // If the user doesn't provide a vectorization factor, determine a
7448     // reasonable one.
7449     if (UserVF.isZero()) {
7450       VF = ElementCount::getFixed(
7451           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7452       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7453 
7454       // Make sure we have a VF > 1 for stress testing.
7455       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7456         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7457                           << "overriding computed VF.\n");
7458         VF = ElementCount::getFixed(4);
7459       }
7460     }
7461     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7462     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7463            "VF needs to be a power of two");
7464     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7465                       << "VF " << VF << " to build VPlans.\n");
7466     buildVPlans(VF, VF);
7467 
7468     // For VPlan build stress testing, we bail out after VPlan construction.
7469     if (VPlanBuildStressTest)
7470       return VectorizationFactor::Disabled();
7471 
7472     return {VF, 0 /*Cost*/};
7473   }
7474 
7475   LLVM_DEBUG(
7476       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7477                 "VPlan-native path.\n");
7478   return VectorizationFactor::Disabled();
7479 }
7480 
// Plan vectorization of an innermost loop: compute the maximum feasible VF,
// build VPlans for the candidate VFs, and return the selected vectorization
// factor. Returns None when the loop must not be vectorized nor interleaved.
Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  ElementCount MaxVF = MaybeMaxVF.getValue();
  assert(MaxVF.isNonZero() && "MaxVF is zero.");

  // A usable user-specified VF short-circuits factor selection: build plans
  // for that single VF only; no cost is computed for it (cost 0 below).
  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
  if (!UserVF.isZero() &&
      (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
    // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable
    // VFs here, this should be reverted to only use legal UserVFs once the
    // loop below supports scalable VFs.
    ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
                      << " VF " << VF << ".\n");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    CM.selectUserVectorizationFactor(VF);
    CM.collectInLoopReductions();
    buildVPlansWithVPRecipes(VF, VF);
    LLVM_DEBUG(printPlans(dbgs()));
    return {{VF, 0}};
  }

  assert(!MaxVF.isScalable() &&
         "Scalable vectors not yet supported beyond this point");

  // Populate the cost model's per-VF analyses for every candidate VF before
  // building plans and selecting the factor below.
  for (ElementCount VF = ElementCount::getFixed(1);
       ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF.isVector())
      CM.collectInstsToScalarize(VF);
  }

  CM.collectInLoopReductions();

  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
  LLVM_DEBUG(printPlans(dbgs()));
  if (MaxVF.isScalar())
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  return CM.selectVectorizationFactor(MaxVF);
}
7549 
7550 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7551   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7552                     << '\n');
7553   BestVF = VF;
7554   BestUF = UF;
7555 
7556   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7557     return !Plan->hasVF(VF);
7558   });
7559   assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
7560 }
7561 
// Generate code for the single surviving VPlan: create the vector-loop
// skeleton, execute the plan to fill in the loop body, then fix up phis,
// live-outs and analyses.
void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  VPCallbackILV CallbackILV(ILV);

  assert(BestVF.hasValue() && "Vectorization Factor is missing");

  VPTransformState State{*BestVF, BestUF,      LI,
                         DT,      ILV.Builder, ILV.VectorLoopValueMap,
                         &ILV,    CallbackILV};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  // NOTE(review): nullptr presumably means "reuse the trip count already
  // created during skeleton construction" — confirm against
  // getOrCreateTripCount's definition.
  State.TripCount = ILV.getOrCreateTripCount(nullptr);
  State.CanonicalIV = ILV.Induction;

  ILV.printDebugTracesAtStart();

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
  VPlans.front()->execute(&State);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  //    predication, updating analyses.
  ILV.fixVectorizedLoop();

  ILV.printDebugTracesAtEnd();
}
7598 
// Collect instructions of the original loop that will have no use in the
// vectorized loop: exit-condition compares (and their single-use trunc
// operands), induction updates whose only live user is the phi itself, and
// induction cast chains proven redundant during induction analysis.
void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if it's only used by the
  // terminator
  SmallVector<BasicBlock*> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    // Operand 0 of a conditional branch is its condition; non-branch or
    // non-instruction conditions are filtered by the checks below.
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // The operands of the icmp is often a dead trunc, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
          DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if exists, isn't dead: it will be used for masking. Don't kill it.
    // NOTE(review): this compares the *update* instruction against the
    // primary induction PHI returned by getPrimaryInduction(); an update
    // taken from the latch edge seemingly can never be the phi itself, so
    // this guard looks like it never fires. Confirm whether the intent was
    // `Ind == Legal->getPrimaryInduction()`.
    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
      continue;

    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
        }))
      DeadInstructions.insert(IndUpdate);

    // We record as "Dead" also the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting chain
    // will get its scalar/vector/widened def from the scalar/vector/widened def
    // of the respective phi node. Any other casts in the induction def-use chain
    // have no other uses outside the phi update chain, and will be ignored.
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    DeadInstructions.insert(Casts.begin(), Casts.end());
  }
}
7655 
// Unrolling operates on scalars (VF == 1), so reversing is the identity.
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7657 
// Broadcasting a scalar to a one-element "vector" is the identity.
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7659 
7660 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7661                                         Instruction::BinaryOps BinOp) {
7662   // When unrolling and the VF is 1, we only need to add a simple scalar.
7663   Type *Ty = Val->getType();
7664   assert(!Ty->isVectorTy() && "Val must be a scalar");
7665 
7666   if (Ty->isFloatingPointTy()) {
7667     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7668 
7669     // Floating point operations had to be 'fast' to enable the unrolling.
7670     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7671     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7672   }
7673   Constant *C = ConstantInt::get(Ty, StartIdx);
7674   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7675 }
7676 
7677 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7678   SmallVector<Metadata *, 4> MDs;
7679   // Reserve first location for self reference to the LoopID metadata node.
7680   MDs.push_back(nullptr);
7681   bool IsUnrollMetadata = false;
7682   MDNode *LoopID = L->getLoopID();
7683   if (LoopID) {
7684     // First find existing loop unrolling disable metadata.
7685     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7686       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7687       if (MD) {
7688         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7689         IsUnrollMetadata =
7690             S && S->getString().startswith("llvm.loop.unroll.disable");
7691       }
7692       MDs.push_back(LoopID->getOperand(i));
7693     }
7694   }
7695 
7696   if (!IsUnrollMetadata) {
7697     // Add runtime unroll disable metadata.
7698     LLVMContext &Context = L->getHeader()->getContext();
7699     SmallVector<Metadata *, 1> DisableOperands;
7700     DisableOperands.push_back(
7701         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7702     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7703     MDs.push_back(DisableNode);
7704     MDNode *NewLoopID = MDNode::get(Context, MDs);
7705     // Set operand 0 to refer to the loop id itself.
7706     NewLoopID->replaceOperandWith(0, NewLoopID);
7707     L->setLoopID(NewLoopID);
7708   }
7709 }
7710 
7711 //===--------------------------------------------------------------------===//
7712 // EpilogueVectorizerMainLoop
7713 //===--------------------------------------------------------------------===//
7714 
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
/// First pass of epilogue vectorization: builds the skeleton around the main
/// vector loop and records in EPI the blocks/values that the second
/// (epilogue) pass will need to rewire.
BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  Loop *Lp = createVectorLoopSkeleton("");

  // Generate the code to check the minimum iteration count of the vector
  // epilogue (see below).
  EPI.EpilogueIterationCountCheck =
      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
  EPI.EpilogueIterationCountCheck->setName("iter.check");

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions. A change of LoopVectorPreHeader indicates a check block
  // was split off.
  BasicBlock *SavedPreHeader = LoopVectorPreHeader;
  emitSCEVChecks(Lp, LoopScalarPreHeader);

  // If a safety check was generated save it.
  if (SavedPreHeader != LoopVectorPreHeader)
    EPI.SCEVSafetyCheck = SavedPreHeader;

  // Generate the code that checks at runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  SavedPreHeader = LoopVectorPreHeader;
  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // If a safety check was generated save/overwrite it.
  if (SavedPreHeader != LoopVectorPreHeader)
    EPI.MemSafetyCheck = SavedPreHeader;

  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer-path length for
  // the main loop is compensated for, by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
  // the epilogue.
  EPI.MainLoopIterationCountCheck =
      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  // The second pass uses this vector trip count as the epilogue's resume
  // value when the main vector loop executed.
  EPI.VectorTripCount = CountRoundDown;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Skip induction resume value creation here because they will be created in
  // the second pass. If we created them here, they wouldn't be used anyway,
  // because the vplan in the second pass still contains the inductions from the
  // original loop.

  return completeLoopSkeleton(Lp, OrigLoopID);
}
7773 
7774 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7775   LLVM_DEBUG({
7776     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7777            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7778            << ", Main Loop UF:" << EPI.MainLoopUF
7779            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7780            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7781   });
7782 }
7783 
void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
  // Dump the whole function after the first (main-loop) pass; only emitted
  // under the more verbose debug type.
  DEBUG_WITH_TYPE(VerboseDebug, {
    dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
  });
}
7789 
// Emit a guard that bypasses the vector loop (branching to \p Bypass) when
// the trip count is below VF * UF. With \p ForEpilogue set, the epilogue's
// VF/UF are checked instead, and bypass/trip-count state is recorded in EPI
// for the second pass.
BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
    Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
  assert(L && "Expected valid Loop.");
  assert(Bypass && "Expected valid bypass basic block.");
  unsigned VFactor =
      ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
  unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
  Value *Count = getOrCreateTripCount(L);
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // main vector loop. ULE (instead of ULT) leaves at least one iteration for
  // the scalar epilogue when one is required.
  auto P =
      Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;

  Value *CheckMinIters = Builder.CreateICmp(
      P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
      "min.iters.check");

  if (!ForEpilogue)
    TCCheckBlock->setName("vector.main.loop.iter.check");

  // Create new preheader for vector loop.
  LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
                                   DT, LI, nullptr, "vector.ph");

  if (ForEpilogue) {
    assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                                 DT->getNode(Bypass)->getIDom()) &&
           "TC check is expected to dominate Bypass");

    // Update dominator for Bypass & LoopExit.
    DT->changeImmediateDominator(Bypass, TCCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

    LoopBypassBlocks.push_back(TCCheckBlock);

    // Save the trip count so we don't have to regenerate it in the
    // vec.epilog.iter.check. This is safe to do because the trip count
    // generated here dominates the vector epilog iter check.
    EPI.TripCount = Count;
  }

  // Replace the unconditional terminator with the conditional bypass branch.
  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));

  return TCCheckBlock;
}
7842 
7843 //===--------------------------------------------------------------------===//
7844 // EpilogueVectorizerEpilogueLoop
7845 //===--------------------------------------------------------------------===//
7846 
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
/// Second pass of epilogue vectorization: builds the epilogue vector loop's
/// skeleton and rewires the check blocks that the first pass saved in EPI.
BasicBlock *
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  Loop *Lp = createVectorLoopSkeleton("vec.epilog.");

  // Now, compare the remaining count and if there aren't enough iterations to
  // execute the vectorized epilogue skip to the scalar part.
  BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
  VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
  LoopVectorPreHeader =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, "vec.epilog.ph");
  emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
                                          VecEpilogueIterationCountCheck);

  // Adjust the control flow taking the state info from the main loop
  // vectorization into account.
  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
         "expected this to be saved from the previous pass.");
  // A too-small trip count for the main loop now branches straight to the
  // epilogue's vector preheader instead of the split-off check block.
  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopVectorPreHeader);

  DT->changeImmediateDominator(LoopVectorPreHeader,
                               EPI.MainLoopIterationCountCheck);

  // The remaining saved checks bypass the epilogue entirely and branch to
  // the scalar preheader.
  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  if (EPI.SCEVSafetyCheck)
    EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);
  if (EPI.MemSafetyCheck)
    EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  DT->changeImmediateDominator(
      VecEpilogueIterationCountCheck,
      VecEpilogueIterationCountCheck->getSinglePredecessor());

  DT->changeImmediateDominator(LoopScalarPreHeader,
                               EPI.EpilogueIterationCountCheck);
  DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);

  // Keep track of bypass blocks, as they feed start values to the induction
  // phis in the scalar loop preheader.
  if (EPI.SCEVSafetyCheck)
    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
  if (EPI.MemSafetyCheck)
    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader: it starts at the main loop's vector trip
  // count when the main loop ran, or at 0 when it was bypassed.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *StartIdx = EPResumeVal;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to iteration count
  // check, then the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(Lp, CountRoundDown,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  AddRuntimeUnrollDisableMetaData(Lp);
  return completeLoopSkeleton(Lp, OrigLoopID);
}
7932 
7933 BasicBlock *
7934 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7935     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
7936 
7937   assert(EPI.TripCount &&
7938          "Expected trip count to have been safed in the first pass.");
7939   assert(
7940       (!isa<Instruction>(EPI.TripCount) ||
7941        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7942       "saved trip count does not dominate insertion point.");
7943   Value *TC = EPI.TripCount;
7944   IRBuilder<> Builder(Insert->getTerminator());
7945   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7946 
7947   // Generate code to check if the loop's trip count is less than VF * UF of the
7948   // vector epilogue loop.
7949   auto P =
7950       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7951 
7952   Value *CheckMinIters = Builder.CreateICmp(
7953       P, Count,
7954       ConstantInt::get(Count->getType(),
7955                        EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
7956       "min.epilog.iters.check");
7957 
7958   ReplaceInstWithInst(
7959       Insert->getTerminator(),
7960       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7961 
7962   LoopBypassBlocks.push_back(Insert);
7963   return Insert;
7964 }
7965 
7966 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7967   LLVM_DEBUG({
7968     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7969            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7970            << ", Main Loop UF:" << EPI.MainLoopUF
7971            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7972            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7973   });
7974 }
7975 
void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
  // Dump the whole function after the second (epilogue) pass; only emitted
  // under the more verbose debug type.
  DEBUG_WITH_TYPE(VerboseDebug, {
    dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
  });
}
7981 
7982 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7983     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7984   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7985   bool PredicateAtRangeStart = Predicate(Range.Start);
7986 
7987   for (ElementCount TmpVF = Range.Start * 2;
7988        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7989     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7990       Range.End = TmpVF;
7991       break;
7992     }
7993 
7994   return PredicateAtRangeStart;
7995 }
7996 
7997 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7998 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7999 /// of VF's starting at a given VF and extending it as much as possible. Each
8000 /// vectorization decision can potentially shorten this sub-range during
8001 /// buildVPlan().
8002 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8003                                            ElementCount MaxVF) {
8004   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8005   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8006     VFRange SubRange = {VF, MaxVFPlusOne};
8007     VPlans.push_back(buildVPlan(SubRange));
8008     VF = SubRange.End;
8009   }
8010 }
8011 
// Create (or fetch from cache) the predicate mask of CFG edge Src -> Dst.
// A null result means the edge mask is all-one (no masking needed).
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                         VPlanPtr &Plan) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
  if (ECEntryIt != EdgeMaskCache.end())
    return ECEntryIt->second;

  VPValue *SrcMask = createBlockInMask(Src, Plan);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  // An unconditional branch (or both successors equal) passes Src's mask
  // through unchanged.
  if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
    return EdgeMaskCache[Edge] = SrcMask;

  // If source is an exiting block, we know the exit edge is dynamically dead
  // in the vector loop, and thus we don't need to restrict the mask.  Avoid
  // adding uses of an otherwise potentially dead instruction.
  if (OrigLoop->isLoopExiting(Src))
    return EdgeMaskCache[Edge] = SrcMask;

  VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
  assert(EdgeMask && "No Edge Mask found for condition");

  // The edge is taken on the false successor under the negated condition.
  if (BI->getSuccessor(0) != Dst)
    EdgeMask = Builder.createNot(EdgeMask);

  if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
    EdgeMask = Builder.createAnd(EdgeMask, SrcMask);

  return EdgeMaskCache[Edge] = EdgeMask;
}
8048 
// Create (or fetch from cache) the mask under which block BB executes in the
// vector loop: the header mask models tail folding; any other block's mask
// is the OR of its incoming edge masks. Null means all-one.
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Look for cached value.
  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
  if (BCEntryIt != BlockMaskCache.end())
    return BCEntryIt->second;

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;

  if (OrigLoop->getHeader() == BB) {
    if (!CM.blockNeedsPredication(BB))
      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.

    // Create the block in mask as the first non-phi instruction in the block.
    VPBuilder::InsertPointGuard Guard(Builder);
    auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
    Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);

    // Introduce the early-exit compare IV <= BTC to form header block mask.
    // This is used instead of IV < TC because TC may wrap, unlike BTC.
    // Start by constructing the desired canonical IV.
    VPValue *IV = nullptr;
    if (Legal->getPrimaryInduction())
      IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
    else {
      // No primary induction available: widen a fresh canonical IV recipe.
      auto IVRecipe = new VPWidenCanonicalIVRecipe();
      Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
      IV = IVRecipe->getVPValue();
    }
    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
    bool TailFolded = !CM.isScalarEpilogueAllowed();

    if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // While ActiveLaneMask is a binary op that consumes the loop tripcount
      // as a second argument, we only pass the IV here and extract the
      // tripcount from the transform state where codegen of the VP instructions
      // happen.
      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
    } else {
      BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
    }
    return BlockMaskCache[BB] = BlockMask;
  }

  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
      return BlockMaskCache[BB] = EdgeMask;

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask);
  }

  return BlockMaskCache[BB] = BlockMask;
}
8112 
8113 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
8114                                                 VPlanPtr &Plan) {
8115   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8116          "Must be called with either a load or store");
8117 
8118   auto willWiden = [&](ElementCount VF) -> bool {
8119     if (VF.isScalar())
8120       return false;
8121     LoopVectorizationCostModel::InstWidening Decision =
8122         CM.getWideningDecision(I, VF);
8123     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8124            "CM decision should be taken at this point.");
8125     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8126       return true;
8127     if (CM.isScalarAfterVectorization(I, VF) ||
8128         CM.isProfitableToScalarize(I, VF))
8129       return false;
8130     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8131   };
8132 
8133   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8134     return nullptr;
8135 
8136   VPValue *Mask = nullptr;
8137   if (Legal->isMaskRequired(I))
8138     Mask = createBlockInMask(I->getParent(), Plan);
8139 
8140   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
8141   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8142     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
8143 
8144   StoreInst *Store = cast<StoreInst>(I);
8145   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
8146   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
8147 }
8148 
8149 VPWidenIntOrFpInductionRecipe *
8150 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const {
8151   // Check if this is an integer or fp induction. If so, build the recipe that
8152   // produces its scalar and vector values.
8153   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8154   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8155       II.getKind() == InductionDescriptor::IK_FpInduction) {
8156     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8157     return new VPWidenIntOrFpInductionRecipe(Phi, Start);
8158   }
8159 
8160   return nullptr;
8161 }
8162 
8163 VPWidenIntOrFpInductionRecipe *
8164 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range,
8165                                                 VPlan &Plan) const {
8166   // Optimize the special case where the source is a constant integer
8167   // induction variable. Notice that we can only optimize the 'trunc' case
8168   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8169   // (c) other casts depend on pointer size.
8170 
8171   // Determine whether \p K is a truncation based on an induction variable that
8172   // can be optimized.
8173   auto isOptimizableIVTruncate =
8174       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8175     return [=](ElementCount VF) -> bool {
8176       return CM.isOptimizableIVTruncate(K, VF);
8177     };
8178   };
8179 
8180   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8181           isOptimizableIVTruncate(I), Range)) {
8182 
8183     InductionDescriptor II =
8184         Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
8185     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8186     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8187                                              Start, I);
8188   }
8189   return nullptr;
8190 }
8191 
8192 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
8193   // We know that all PHIs in non-header blocks are converted into selects, so
8194   // we don't have to worry about the insertion order and we can just use the
8195   // builder. At this point we generate the predication tree. There may be
8196   // duplications since this is a simple recursive scan, but future
8197   // optimizations will clean it up.
8198 
8199   SmallVector<VPValue *, 2> Operands;
8200   unsigned NumIncoming = Phi->getNumIncomingValues();
8201   for (unsigned In = 0; In < NumIncoming; In++) {
8202     VPValue *EdgeMask =
8203       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8204     assert((EdgeMask || NumIncoming == 1) &&
8205            "Multiple predecessors with one having a full mask");
8206     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
8207     if (EdgeMask)
8208       Operands.push_back(EdgeMask);
8209   }
8210   return new VPBlendRecipe(Phi, Operands);
8211 }
8212 
8213 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
8214                                                    VPlan &Plan) const {
8215 
8216   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8217       [this, CI](ElementCount VF) {
8218         return CM.isScalarWithPredication(CI, VF);
8219       },
8220       Range);
8221 
8222   if (IsPredicated)
8223     return nullptr;
8224 
8225   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8226   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8227              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8228              ID == Intrinsic::pseudoprobe))
8229     return nullptr;
8230 
8231   auto willWiden = [&](ElementCount VF) -> bool {
8232     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8233     // The following case may be scalarized depending on the VF.
8234     // The flag shows whether we use Intrinsic or a usual Call for vectorized
8235     // version of the instruction.
8236     // Is it beneficial to perform intrinsic call compared to lib call?
8237     bool NeedToScalarize = false;
8238     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8239     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8240     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8241     assert(IntrinsicCost.isValid() && CallCost.isValid() &&
8242            "Cannot have invalid costs while widening");
8243     return UseVectorIntrinsic || !NeedToScalarize;
8244   };
8245 
8246   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8247     return nullptr;
8248 
8249   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
8250 }
8251 
8252 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8253   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8254          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8255   // Instruction should be widened, unless it is scalar after vectorization,
8256   // scalarization is profitable or it is predicated.
8257   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8258     return CM.isScalarAfterVectorization(I, VF) ||
8259            CM.isProfitableToScalarize(I, VF) ||
8260            CM.isScalarWithPredication(I, VF);
8261   };
8262   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8263                                                              Range);
8264 }
8265 
8266 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
8267   auto IsVectorizableOpcode = [](unsigned Opcode) {
8268     switch (Opcode) {
8269     case Instruction::Add:
8270     case Instruction::And:
8271     case Instruction::AShr:
8272     case Instruction::BitCast:
8273     case Instruction::FAdd:
8274     case Instruction::FCmp:
8275     case Instruction::FDiv:
8276     case Instruction::FMul:
8277     case Instruction::FNeg:
8278     case Instruction::FPExt:
8279     case Instruction::FPToSI:
8280     case Instruction::FPToUI:
8281     case Instruction::FPTrunc:
8282     case Instruction::FRem:
8283     case Instruction::FSub:
8284     case Instruction::ICmp:
8285     case Instruction::IntToPtr:
8286     case Instruction::LShr:
8287     case Instruction::Mul:
8288     case Instruction::Or:
8289     case Instruction::PtrToInt:
8290     case Instruction::SDiv:
8291     case Instruction::Select:
8292     case Instruction::SExt:
8293     case Instruction::Shl:
8294     case Instruction::SIToFP:
8295     case Instruction::SRem:
8296     case Instruction::Sub:
8297     case Instruction::Trunc:
8298     case Instruction::UDiv:
8299     case Instruction::UIToFP:
8300     case Instruction::URem:
8301     case Instruction::Xor:
8302     case Instruction::ZExt:
8303       return true;
8304     }
8305     return false;
8306   };
8307 
8308   if (!IsVectorizableOpcode(I->getOpcode()))
8309     return nullptr;
8310 
8311   // Success: widen this instruction.
8312   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
8313 }
8314 
// Build a VPReplicateRecipe for \p I, which could not be widened. Returns the
// VPBasicBlock subsequent recipes should be appended to: \p VPBB itself when
// \p I is unpredicated, or a fresh successor block when \p I had to be wrapped
// in a predicated replication region.
VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
    VPlanPtr &Plan) {
  // Uniform: one scalar copy per part suffices for the whole clamped range.
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  // Predicated: the scalarized copies must be guarded by the block mask.
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
      Range);

  auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                       IsUniform, IsPredicated);
  setRecipe(I, Recipe);
  Plan->addVPValue(I, Recipe);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (auto &Op : I->operands())
    if (auto *PredInst = dyn_cast<Instruction>(Op))
      if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
        PredInst2Recipe[PredInst]->setAlsoPack(false);

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
  assert(VPBB->getSuccessors().empty() &&
         "VPBB has successors when handling predicated replication.");
  // Record predicated instructions for above packing optimizations.
  PredInst2Recipe[I] = Recipe;
  // Wrap the recipe in an if-then region and resume recipe construction in a
  // new empty block placed after that region.
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  return RegSucc;
}
8357 
// Build a triangular if-then VPRegionBlock around \p PredRecipe so the
// replicated instruction only executes for lanes whose block mask is set.
VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  // Void-typed instructions produce no value, so no phi is needed to merge
  // the predicated result back into straight-line code.
  auto *PHIRecipe = Instr->getType()->isVoidTy()
                        ? nullptr
                        : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}
8386 
// Dispatch \p Instr to the widening strategies in priority order, returning
// the recipe of the first one that applies, or nullptr if the instruction
// must be replicated instead (see handleReplication).
VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                                      VFRange &Range,
                                                      VPlanPtr &Plan) {
  // First, check for specific widening recipes that deal with calls, memory
  // operations, inductions and Phi nodes.
  if (auto *CI = dyn_cast<CallInst>(Instr))
    return tryToWidenCall(CI, Range, *Plan);

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return tryToWidenMemory(Instr, Range, Plan);

  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Instr)) {
    // Non-header phis become blends (selects over edge masks).
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Plan);
    // Header phis: induction, then reduction, then a generic widened phi.
    if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
      return Recipe;

    if (Legal->isReductionVariable(Phi)) {
      RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
      VPValue *StartV =
          Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue());
      return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV);
    }

    return new VPWidenPHIRecipe(Phi);
  }

  // A trunc of an induction may be folded into the induction recipe itself.
  if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
                                    cast<TruncInst>(Instr), Range, *Plan)))
    return Recipe;

  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
                                OrigLoop);

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    // Record whether the select's condition is loop-invariant so codegen can
    // keep it scalar.
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
                                   InvariantCond);
  }

  return tryToWiden(Instr, *Plan);
}
8435 
8436 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8437                                                         ElementCount MaxVF) {
8438   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8439 
8440   // Collect instructions from the original loop that will become trivially dead
8441   // in the vectorized loop. We don't need to vectorize these instructions. For
8442   // example, original induction update instructions can become dead because we
8443   // separately emit induction "steps" when generating code for the new loop.
8444   // Similarly, we create a new latch condition when setting up the structure
8445   // of the new loop, so the old one can become dead.
8446   SmallPtrSet<Instruction *, 4> DeadInstructions;
8447   collectTriviallyDeadInstructions(DeadInstructions);
8448 
8449   // Add assume instructions we need to drop to DeadInstructions, to prevent
8450   // them from being added to the VPlan.
8451   // TODO: We only need to drop assumes in blocks that get flattend. If the
8452   // control flow is preserved, we should keep them.
8453   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8454   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8455 
8456   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8457   // Dead instructions do not need sinking. Remove them from SinkAfter.
8458   for (Instruction *I : DeadInstructions)
8459     SinkAfter.erase(I);
8460 
8461   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8462   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8463     VFRange SubRange = {VF, MaxVFPlusOne};
8464     VPlans.push_back(
8465         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8466     VF = SubRange.End;
8467   }
8468 }
8469 
// Build one VPlan covering (a clamped prefix of) \p Range: record ingredients
// needing later processing, scan the loop body building recipes per
// instruction, then apply sink-after, interleave-group, in-loop-reduction and
// tail-folding transforms to reach the plan's final state.
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const DenseMap<Instruction *, Instruction *> &SinkAfter) {

  // Hold a mapping from predicated instructions to their recipes, in order to
  // fix their AlsoPack behavior if a user is determined to replicate and use a
  // scalar instead of vector value.
  DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;

  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](ElementCount VF) -> bool {
      return (VF.isVector() && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  };

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
  auto Plan = std::make_unique<VPlan>();
  VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
  Plan->setEntry(VPBB);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
        continue;

      if (auto Recipe =
              RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
        // Register each value the recipe defines under its underlying IR
        // value, so later lookups by IR value find the recipe's VPValues.
        for (auto *Def : Recipe->definedValues()) {
          auto *UV = Def->getUnderlyingValue();
          Plan->addVPValue(UV, Def);
        }

        RecipeBuilder.setRecipe(Instr, Recipe);
        VPBB->appendRecipe(Recipe);
        continue;
      }

      // Otherwise, if all widening options failed, Instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
          Instr, Range, VPBB, PredInst2Recipe, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        // A replication region split BB across multiple VPBBs; number the
        // continuation blocks BB.0, BB.1, ...
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one VPBB, reflecting original
  // basic-blocks with no recipes.
  VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
  assert(PreEntry->empty() && "Expecting empty pre-entry block.");
  VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
  VPBlockUtils::disconnectBlocks(PreEntry, Entry);
  delete PreEntry;

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  // Apply Sink-After legal constraints.
  for (auto &Entry : SinkAfter) {
    VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
    VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
    // If the target is in a replication region, make sure to move Sink to the
    // block after it, not into the replication region itself.
    if (auto *Region =
            dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) {
      if (Region->isReplicator()) {
        assert(Region->getNumSuccessors() == 1 && "Expected SESE region!");
        VPBasicBlock *NextBlock =
            cast<VPBasicBlock>(Region->getSuccessors().front());
        Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
        continue;
      }
    }
    Sink->moveAfter(Target);
  }

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with a
  // single VPInterleaveRecipe at its insertion point.
  for (auto IG : InterleaveGroups) {
    auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
        RecipeBuilder.getRecipe(IG->getInsertPos()));
    SmallVector<VPValue *, 4> StoredValues;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
        StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));

    auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
                                        Recipe->getMask());
    VPIG->insertBefore(Recipe);
    // Re-route each member's users to the interleave recipe's corresponding
    // result (loads only; void-typed members define no value), then erase the
    // member's original recipe.
    unsigned J = 0;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (Instruction *Member = IG->getMember(i)) {
        if (!Member->getType()->isVoidTy()) {
          VPValue *OriginalV = Plan->getVPValue(Member);
          Plan->removeVPValueFor(Member);
          Plan->addVPValue(Member, VPIG->getVPValue(J));
          OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
          J++;
        }
        RecipeBuilder.getRecipe(Member)->eraseFromParent();
      }
  }

  // Adjust the recipes for any inloop reductions.
  if (Range.Start.isVector())
    adjustRecipesForInLoopReductions(Plan, RecipeBuilder);

  // Finally, if tail is folded by masking, introduce selects between the phi
  // and the live-out instruction of each reduction, at the end of the latch.
  if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
    Builder.setInsertPoint(VPBB);
    auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
    for (auto &Reduction : Legal->getReductionVars()) {
      // In-loop reductions are already predicated via their CondOp.
      if (CM.isInLoopReduction(Reduction.first))
        continue;
      VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
      VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
      Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
    }
  }

  // Name the plan after the (power-of-two) VFs it covers, e.g.
  // "Initial VPlan for VF={4,8},UF>=1".
  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  ElementCount VF = Range.Start;
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  return Plan;
}
8679 
// Build a VPlan for the VPlan-native (outer-loop) path over \p Range, without
// per-instruction cost-model decisions.
VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Create new empty VPlan
  auto Plan = std::make_unique<VPlan>();

  // Build hierarchical CFG
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  HCFGBuilder.buildHierarchicalCFG();

  // Record every power-of-two VF in the half-open range [Start, End).
  for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
       VF *= 2)
    Plan->addVF(VF);

  if (EnableVPlanPredication) {
    VPlanPredicator VPP(*Plan);
    VPP.predicate();

    // Avoid running transformation to recipes until masked code generation in
    // VPlan-native path is in place.
    return Plan;
  }

  // No dead instructions are collected on this path; pass an empty set.
  SmallPtrSet<Instruction *, 1> DeadInstructions;
  VPlanTransforms::VPInstructionsToVPRecipes(
      OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
  return Plan;
}
8713 
8714 // Adjust the recipes for any inloop reductions. The chain of instructions
8715 // leading from the loop exit instr to the phi need to be converted to
8716 // reductions, with one operand being vector and the other being the scalar
8717 // reduction chain.
8718 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8719     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8720   for (auto &Reduction : CM.getInLoopReductionChains()) {
8721     PHINode *Phi = Reduction.first;
8722     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8723     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8724 
8725     // ReductionOperations are orders top-down from the phi's use to the
8726     // LoopExitValue. We keep a track of the previous item (the Chain) to tell
8727     // which of the two operands will remain scalar and which will be reduced.
8728     // For minmax the chain will be the select instructions.
8729     Instruction *Chain = Phi;
8730     for (Instruction *R : ReductionOperations) {
8731       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8732       RecurKind Kind = RdxDesc.getRecurrenceKind();
8733 
8734       VPValue *ChainOp = Plan->getVPValue(Chain);
8735       unsigned FirstOpId;
8736       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8737         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8738                "Expected to replace a VPWidenSelectSC");
8739         FirstOpId = 1;
8740       } else {
8741         assert(isa<VPWidenRecipe>(WidenRecipe) &&
8742                "Expected to replace a VPWidenSC");
8743         FirstOpId = 0;
8744       }
8745       unsigned VecOpId =
8746           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
8747       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
8748 
8749       auto *CondOp = CM.foldTailByMasking()
8750                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
8751                          : nullptr;
8752       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
8753           &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
8754       WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe);
8755       Plan->removeVPValueFor(R);
8756       Plan->addVPValue(R, RedRecipe);
8757       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
8758       WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe);
8759       WidenRecipe->eraseFromParent();
8760 
8761       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8762         VPRecipeBase *CompareRecipe =
8763             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
8764         assert(isa<VPWidenRecipe>(CompareRecipe) &&
8765                "Expected to replace a VPWidenSC");
8766         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
8767                "Expected no remaining users");
8768         CompareRecipe->eraseFromParent();
8769       }
8770       Chain = R;
8771     }
8772   }
8773 }
8774 
8775 Value* LoopVectorizationPlanner::VPCallbackILV::
8776 getOrCreateVectorValues(Value *V, unsigned Part) {
8777       return ILV.getOrCreateVectorValue(V, Part);
8778 }
8779 
// Thin adapter: forward the per-instance scalar-value query to the
// InnerLoopVectorizer.
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
    Value *V, const VPIteration &Instance) {
  return ILV.getOrCreateScalarValue(V, Instance);
}
8784 
8785 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
8786                                VPSlotTracker &SlotTracker) const {
8787   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
8788   IG->getInsertPos()->printAsOperand(O, false);
8789   O << ", ";
8790   getAddr()->printAsOperand(O, SlotTracker);
8791   VPValue *Mask = getMask();
8792   if (Mask) {
8793     O << ", ";
8794     Mask->printAsOperand(O, SlotTracker);
8795   }
8796   for (unsigned i = 0; i < IG->getFactor(); ++i)
8797     if (Instruction *I = IG->getMember(i))
8798       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
8799 }
8800 
// Delegate to ILV; 'this' is passed twice, once as the recipe's defined
// VPValue and once as the VPUser carrying the call's operands.
void VPWidenCallRecipe::execute(VPTransformState &State) {
  State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
                                  *this, State);
}
8805 
// Delegate to ILV, forwarding the invariant-condition flag recorded when the
// recipe was built.
void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
                                    this, *this, InvariantCond, State);
}
8810 
// Widen the wrapped instruction by delegating to the InnerLoopVectorizer.
void VPWidenRecipe::execute(VPTransformState &State) {
  State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
}
8814 
// Widen the underlying GEP. The loop-invariance flags for the pointer operand
// and for each index were computed when the recipe was built.
void VPWidenGEPRecipe::execute(VPTransformState &State) {
  State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
                      *this, State.UF, State.VF, IsPtrLoopInvariant,
                      IsIndexLoopInvariant, State);
}
8820 
// Widen an integer or floating-point induction variable. Inductions are never
// scalarized per-instance, hence the assertion on State.Instance.
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");
  State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
                                   Trunc);
}
8826 
8827 void VPWidenPHIRecipe::execute(VPTransformState &State) {
8828   Value *StartV =
8829       getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr;
8830   State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF);
8831 }
8832 
8833 void VPBlendRecipe::execute(VPTransformState &State) {
8834   State.ILV->setDebugLocFromInst(State.Builder, Phi);
8835   // We know that all PHIs in non-header blocks are converted into
8836   // selects, so we don't have to worry about the insertion order and we
8837   // can just use the builder.
8838   // At this point we generate the predication tree. There may be
8839   // duplications since this is a simple recursive scan, but future
8840   // optimizations will clean it up.
8841 
8842   unsigned NumIncoming = getNumIncomingValues();
8843 
8844   // Generate a sequence of selects of the form:
8845   // SELECT(Mask3, In3,
8846   //        SELECT(Mask2, In2,
8847   //               SELECT(Mask1, In1,
8848   //                      In0)))
8849   // Note that Mask0 is never used: lanes for which no path reaches this phi and
8850   // are essentially undef are taken from In0.
8851   InnerLoopVectorizer::VectorParts Entry(State.UF);
8852   for (unsigned In = 0; In < NumIncoming; ++In) {
8853     for (unsigned Part = 0; Part < State.UF; ++Part) {
8854       // We might have single edge PHIs (blocks) - use an identity
8855       // 'select' for the first PHI operand.
8856       Value *In0 = State.get(getIncomingValue(In), Part);
8857       if (In == 0)
8858         Entry[Part] = In0; // Initialize with the first incoming value.
8859       else {
8860         // Select between the current value and the previous incoming edge
8861         // based on the incoming mask.
8862         Value *Cond = State.get(getMask(In), Part);
8863         Entry[Part] =
8864             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8865       }
8866     }
8867   }
8868   for (unsigned Part = 0; Part < State.UF; ++Part)
8869     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
8870 }
8871 
// Emit the wide loads/stores for the whole interleave group at once.
// Interleave groups are never replicated per-instance.
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
                                      getStoredValues(), getMask());
}
8877 
8878 void VPReductionRecipe::execute(VPTransformState &State) {
8879   assert(!State.Instance && "Reduction being replicated.");
8880   for (unsigned Part = 0; Part < State.UF; ++Part) {
8881     RecurKind Kind = RdxDesc->getRecurrenceKind();
8882     Value *NewVecOp = State.get(getVecOp(), Part);
8883     if (VPValue *Cond = getCondOp()) {
8884       Value *NewCond = State.get(Cond, Part);
8885       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
8886       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
8887           Kind, VecTy->getElementType());
8888       Constant *IdenVec =
8889           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
8890       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
8891       NewVecOp = Select;
8892     }
8893     Value *NewRed =
8894         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
8895     Value *PrevInChain = State.get(getChainOp(), Part);
8896     Value *NextInChain;
8897     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8898       NextInChain =
8899           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
8900                          NewRed, PrevInChain);
8901     } else {
8902       NextInChain = State.Builder.CreateBinOp(
8903           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
8904           PrevInChain);
8905     }
8906     State.set(this, getUnderlyingInstr(), NextInChain, Part);
8907   }
8908 }
8909 
// Replicate (scalarize) the underlying instruction: either a single requested
// instance, or one scalar copy per lane of every unroll part.
void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
    State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
                                    *State.Instance, IsPredicated, State);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from poison.
      if (State.Instance->Lane == 0) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.ValueMap.setVectorValue(getUnderlyingInstr(),
                                      State.Instance->Part, Poison);
      }
      State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
                                           *State.Instance);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  // Scalable vectors cannot be enumerated lane-by-lane; only uniform
  // replication (lane 0 per part) is supported for them.
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
                                      IsPredicated, State);
}
8942 
8943 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8944   assert(State.Instance && "Branch on Mask works only on single instance.");
8945 
8946   unsigned Part = State.Instance->Part;
8947   unsigned Lane = State.Instance->Lane;
8948 
8949   Value *ConditionBit = nullptr;
8950   VPValue *BlockInMask = getMask();
8951   if (BlockInMask) {
8952     ConditionBit = State.get(BlockInMask, Part);
8953     if (ConditionBit->getType()->isVectorTy())
8954       ConditionBit = State.Builder.CreateExtractElement(
8955           ConditionBit, State.Builder.getInt32(Lane));
8956   } else // Block in mask is all-one.
8957     ConditionBit = State.Builder.getTrue();
8958 
8959   // Replace the temporary unreachable terminator with a new conditional branch,
8960   // whose two destinations will be set later when they are created.
8961   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8962   assert(isa<UnreachableInst>(CurrentTerminator) &&
8963          "Expected to replace unreachable terminator with conditional branch.");
8964   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8965   CondBr->setSuccessor(0, nullptr);
8966   ReplaceInstWithInst(CurrentTerminator, CondBr);
8967 }
8968 
// Create the phi that merges the predicated instruction's result back into
// the predicating block, for either the packed vector value or the scalar
// value, depending on which one exists at this point.
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  Instruction *PredInst =
      cast<Instruction>(getOperand(0)->getUnderlyingValue());
  if (State.ValueMap.hasVectorValue(PredInst, Part)) {
    // The cached vector value is the insert-element produced by the packing
    // recipe; phi between the pre-insert vector and the updated one.
    Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
  } else {
    // Scalar case: lanes whose predicate was false observe poison.
    Type *PredInstType = PredInst->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
  }
}
9001 
// Widen the underlying load/store. A store carries the value to store and
// defines no result; for a load, this recipe's VPValue is passed as the
// definition of the loaded value.
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
  State.ILV->vectorizeMemoryInstruction(&Ingredient, State,
                                        StoredValue ? nullptr : getVPValue(),
                                        getAddr(), StoredValue, getMask());
}
9008 
9009 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9010 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9011 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9012 // for predication.
9013 static ScalarEpilogueLowering getScalarEpilogueLowering(
9014     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9015     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9016     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9017     LoopVectorizationLegality &LVL) {
9018   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9019   // don't look at hints or options, and don't request a scalar epilogue.
9020   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9021   // LoopAccessInfo (due to code dependency and not being able to reliably get
9022   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9023   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9024   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9025   // back to the old way and vectorize with versioning when forced. See D81345.)
9026   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9027                                                       PGSOQueryType::IRPass) &&
9028                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9029     return CM_ScalarEpilogueNotAllowedOptSize;
9030 
9031   // 2) If set, obey the directives
9032   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9033     switch (PreferPredicateOverEpilogue) {
9034     case PreferPredicateTy::ScalarEpilogue:
9035       return CM_ScalarEpilogueAllowed;
9036     case PreferPredicateTy::PredicateElseScalarEpilogue:
9037       return CM_ScalarEpilogueNotNeededUsePredicate;
9038     case PreferPredicateTy::PredicateOrDontVectorize:
9039       return CM_ScalarEpilogueNotAllowedUsePredicate;
9040     };
9041   }
9042 
9043   // 3) If set, obey the hints
9044   switch (Hints.getPredicate()) {
9045   case LoopVectorizeHints::FK_Enabled:
9046     return CM_ScalarEpilogueNotNeededUsePredicate;
9047   case LoopVectorizeHints::FK_Disabled:
9048     return CM_ScalarEpilogueAllowed;
9049   };
9050 
9051   // 4) if the TTI hook indicates this is profitable, request predication.
9052   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
9053                                        LVL.getLAI()))
9054     return CM_ScalarEpilogueNotNeededUsePredicate;
9055 
9056   return CM_ScalarEpilogueAllowed;
9057 }
9058 
// Record V as the value of Def for unroll part Part, and mirror it into the
// ILV's IR-value map keyed on IRDef so legacy lookups stay in sync.
void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
                           unsigned Part) {
  set(Def, V, Part);
  ILV->setVectorValue(IRDef, Part, V);
}
9064 
9065 // Process the loop in the VPlan-native vectorization path. This path builds
9066 // VPlan upfront in the vectorization pipeline, which allows to apply
9067 // VPlan-to-VPlan transformations from the very beginning without modifying the
9068 // input LLVM IR.
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
// Returns true iff the (outer) loop was vectorized.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {

  // A computable trip count is required for the outer-loop path.
  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  // Outer-loop vectorization never interleaves: UF is fixed at 1.
  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                         &CM, BFI, PSI);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}
9121 
// Interleaving/vectorization become forced-only when either the pass options
// request it or the corresponding global enable flag is off.
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}
9127 
// Main per-loop driver: check hints and legality, run the cost model and
// planner, then vectorize and/or interleave the loop, emitting optimization
// remarks along the way. Returns true iff the loop was transformed.
bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " unroll=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (Requirements.doesNotMeet(F, L, Hints)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
                         "requirements.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize nor interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  LVP.setBestPlan(VF.Width, IC);

  using namespace ore;
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();

  if (!VectorizeLoop) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
                               BFI, PSI);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                L->getHeader())
             << "interleaved loop (interleaved count: "
             << NV("InterleaveCount", IC) << ")";
    });
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.

    // Consider vectorizing the epilogue too if it's profitable.
    VectorizationFactor EpilogueVF =
      CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
    if (EpilogueVF.Width.isVector()) {

      // The first pass vectorizes the main loop and creates a scalar epilogue
      // to be vectorized by executing the plan (potentially with a different
      // factor) again shortly afterwards.
      EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
                                        EpilogueVF.Width.getKnownMinValue(), 1);
      EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI,
                                         &LVL, &CM, BFI, PSI);

      LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
      LVP.executePlan(MainILV, DT);
      ++LoopsVectorized;

      // Re-simplify and re-form LCSSA for the remainder loop before running
      // the second (epilogue) vectorization pass over it.
      simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
      formLCSSARecursively(*L, *DT, LI, SE);

      // Second pass vectorizes the epilogue and adjusts the control flow
      // edges from the first pass.
      LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
      EPI.MainLoopVF = EPI.EpilogueVF;
      EPI.MainLoopUF = EPI.EpilogueUF;
      EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                               ORE, EPI, &LVL, &CM, BFI, PSI);
      LVP.executePlan(EpilogILV, DT);
      ++LoopsEpilogueVectorized;

      if (!MainILV.areSafetyChecksAdded())
        DisableRuntimeUnroll = true;
    } else {
      InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                             &LVL, &CM, BFI, PSI);
      LVP.executePlan(LB, DT);
      ++LoopsVectorized;

      // Add metadata to disable runtime unrolling a scalar loop when there are
      // no runtime checks about strides and memory. A scalar loop that is
      // rarely used is not worth unrolling.
      if (!LB.areSafetyChecksAdded())
        DisableRuntimeUnroll = true;
    }

    // Report the vectorization decision.
    ORE->emit([&]() {
      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                L->getHeader())
             << "vectorized loop (vectorization width: "
             << NV("VectorizationFactor", VF.Width)
             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
    });
  }

  // Propagate follow-up loop metadata onto the remainder loop if present;
  // otherwise mark the remainder as done and optionally disable its unrolling.
  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}
9458 
// Shared implementation for both pass-manager entry points: cache analysis
// pointers, simplify all loops, then process each supported inner loop.
// Returns whether anything changed and whether the CFG changed.
LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything end up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}
9524 
9525 PreservedAnalyses LoopVectorizePass::run(Function &F,
9526                                          FunctionAnalysisManager &AM) {
9527     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
9528     auto &LI = AM.getResult<LoopAnalysis>(F);
9529     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
9530     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
9531     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
9532     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
9533     auto &AA = AM.getResult<AAManager>(F);
9534     auto &AC = AM.getResult<AssumptionAnalysis>(F);
9535     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
9536     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
9537     MemorySSA *MSSA = EnableMSSALoopDependency
9538                           ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
9539                           : nullptr;
9540 
9541     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
9542     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
9543         [&](Loop &L) -> const LoopAccessInfo & {
9544       LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
9545                                         TLI, TTI, nullptr, MSSA};
9546       return LAM.getResult<LoopAccessAnalysis>(L, AR);
9547     };
9548     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
9549     ProfileSummaryInfo *PSI =
9550         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
9551     LoopVectorizeResult Result =
9552         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
9553     if (!Result.MadeAnyChange)
9554       return PreservedAnalyses::all();
9555     PreservedAnalyses PA;
9556 
9557     // We currently do not preserve loopinfo/dominator analyses with outer loop
9558     // vectorization. Until this is addressed, mark these analyses as preserved
9559     // only for non-VPlan-native path.
9560     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
9561     if (!EnableVPlanNativePath) {
9562       PA.preserve<LoopAnalysis>();
9563       PA.preserve<DominatorTreeAnalysis>();
9564     }
9565     PA.preserve<BasicAA>();
9566     PA.preserve<GlobalsAA>();
9567     if (!Result.MadeCFGChange)
9568       PA.preserveSet<CFGAnalyses>();
9569     return PA;
9570 }
9571