1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
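//
// As a minimal sketch (illustrative pseudocode, assuming a vectorization
// factor of 4 and a trip count that is a multiple of 4), a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten so that each 'wide' iteration processes four
// elements and the induction variable is advanced by 4:
//
//   for (int i = 0; i < n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3]; // one SIMD operation per wide step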
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is an ongoing development effort to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 #ifndef NDEBUG
161 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
162 #endif
163 
164 /// @{
165 /// Metadata attribute names
166 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
167 const char LLVMLoopVectorizeFollowupVectorized[] =
168     "llvm.loop.vectorize.followup_vectorized";
169 const char LLVMLoopVectorizeFollowupEpilogue[] =
170     "llvm.loop.vectorize.followup_epilogue";
171 /// @}
172 
173 STATISTIC(LoopsVectorized, "Number of loops vectorized");
174 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
175 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
176 
177 static cl::opt<bool> EnableEpilogueVectorization(
178     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
179     cl::desc("Enable vectorization of epilogue loops."));
180 
181 static cl::opt<unsigned> EpilogueVectorizationForceVF(
182     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
183     cl::desc("When epilogue vectorization is enabled, and a value greater than "
184              "1 is specified, forces the given VF for all applicable epilogue "
185              "loops."));
186 
187 static cl::opt<unsigned> EpilogueVectorizationMinVF(
188     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189     cl::desc("Only loops with vectorization factor equal to or larger than "
190              "the specified value are considered for epilogue vectorization."));
191 
192 /// Loops with a known constant trip count below this number are vectorized only
193 /// if no scalar iteration overheads are incurred.
194 static cl::opt<unsigned> TinyTripCountVectorThreshold(
195     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
196     cl::desc("Loops with a constant trip count that is smaller than this "
197              "value are vectorized only if no scalar iteration overheads "
198              "are incurred."));
199 
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
// and that predication is preferred; the enum below lists the possible
// strategies. I.e., the vectorizer will try to fold the tail loop (epilogue)
// into the vector body and predicate the instructions accordingly. If
// tail-folding fails, these values select the fallback strategy:
205 namespace PreferPredicateTy {
206   enum Option {
207     ScalarEpilogue = 0,
208     PredicateElseScalarEpilogue,
209     PredicateOrDontVectorize
210   };
211 } // namespace PreferPredicateTy
212 
213 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
214     "prefer-predicate-over-epilogue",
215     cl::init(PreferPredicateTy::ScalarEpilogue),
216     cl::Hidden,
217     cl::desc("Tail-folding and predication preferences over creating a scalar "
218              "epilogue loop."),
219     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
220                          "scalar-epilogue",
221                          "Don't tail-predicate loops, create scalar epilogue"),
222               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
223                          "predicate-else-scalar-epilogue",
                         "Prefer tail-folding, create scalar epilogue if tail "
                         "folding fails."),
226               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
227                          "predicate-dont-vectorize",
                         "Prefer tail-folding, don't attempt vectorization if "
                         "tail-folding fails.")));
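// For example (an illustrative invocation; other flags omitted), asking the
// vectorizer to tail-fold and to give up on loops where tail-folding is not
// possible could look like:
//
//   opt -passes=loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-dont-vectorize ...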
230 
231 static cl::opt<bool> MaximizeBandwidth(
232     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
233     cl::desc("Maximize bandwidth when selecting vectorization factor which "
234              "will be determined by the smallest type in loop."));
235 
236 static cl::opt<bool> EnableInterleavedMemAccesses(
237     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
238     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
239 
240 /// An interleave-group may need masking if it resides in a block that needs
241 /// predication, or in order to mask away gaps.
242 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses "
             "in a loop"));
245 
246 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
247     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
249              "below this number"));
250 
251 static cl::opt<unsigned> ForceTargetNumScalarRegs(
252     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
253     cl::desc("A flag that overrides the target's number of scalar registers."));
254 
255 static cl::opt<unsigned> ForceTargetNumVectorRegs(
256     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
257     cl::desc("A flag that overrides the target's number of vector registers."));
258 
259 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
260     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
261     cl::desc("A flag that overrides the target's max interleave factor for "
262              "scalar loops."));
263 
264 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
265     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
266     cl::desc("A flag that overrides the target's max interleave factor for "
267              "vectorized loops."));
268 
269 static cl::opt<unsigned> ForceTargetInstructionCost(
270     "force-target-instruction-cost", cl::init(0), cl::Hidden,
271     cl::desc("A flag that overrides the target's expected cost for "
272              "an instruction to a single constant value. Mostly "
273              "useful for getting consistent testing."));
274 
275 static cl::opt<unsigned> SmallLoopCost(
276     "small-loop-cost", cl::init(20), cl::Hidden,
277     cl::desc(
278         "The cost of a loop that is considered 'small' by the interleaver."));
279 
280 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
281     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
282     cl::desc("Enable the use of the block frequency analysis to access PGO "
283              "heuristics minimizing code growth in cold regions and being more "
284              "aggressive in hot regions."));
285 
286 // Runtime interleave loops for load/store throughput.
287 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
288     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
289     cl::desc(
290         "Enable runtime interleaving until load/store ports are saturated"));
291 
292 /// Interleave small loops with scalar reductions.
293 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
294     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
295     cl::desc("Enable interleaving for loops with small iteration counts that "
296              "contain scalar reductions to expose ILP."));
297 
298 /// The number of stores in a loop that are allowed to need predication.
299 static cl::opt<unsigned> NumberOfStoresToPredicate(
300     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
301     cl::desc("Max number of stores to be predicated behind an if."));
302 
303 static cl::opt<bool> EnableIndVarRegisterHeur(
304     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
305     cl::desc("Count the induction variable only once when interleaving"));
306 
307 static cl::opt<bool> EnableCondStoresVectorization(
308     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
309     cl::desc("Enable if predication of stores during vectorization."));
310 
311 static cl::opt<unsigned> MaxNestedScalarReductionIC(
312     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
313     cl::desc("The maximum interleave count to use when interleaving a scalar "
314              "reduction in a nested loop."));
315 
316 static cl::opt<bool>
317     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
318                            cl::Hidden,
319                            cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));
321 
322 static cl::opt<bool> PreferPredicatedReductionSelect(
323     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
324     cl::desc(
325         "Prefer predicating a reduction operation over an after loop select."));
326 
327 cl::opt<bool> EnableVPlanNativePath(
328     "enable-vplan-native-path", cl::init(false), cl::Hidden,
329     cl::desc("Enable VPlan-native vectorization path with "
330              "support for outer loop vectorization."));
331 
332 // FIXME: Remove this switch once we have divergence analysis. Currently we
333 // assume divergent non-backedge branches when this switch is true.
334 cl::opt<bool> EnableVPlanPredication(
335     "enable-vplan-predication", cl::init(false), cl::Hidden,
336     cl::desc("Enable VPlan-native vectorization path predicator with "
337              "support for outer loop vectorization."));
338 
339 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
341 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
342 // verification of the H-CFGs built.
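// A typical stress-testing invocation (illustrative; other flags omitted)
// would therefore look like:
//   opt -passes=loop-vectorize -enable-vplan-native-path \
//       -vplan-build-stress-test -vplan-verify-hcfg ...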
343 static cl::opt<bool> VPlanBuildStressTest(
344     "vplan-build-stress-test", cl::init(false), cl::Hidden,
345     cl::desc(
346         "Build VPlan for every supported loop nest in the function and bail "
347         "out right after the build (stress test the VPlan H-CFG construction "
348         "in the VPlan-native vectorization path)."));
349 
350 cl::opt<bool> llvm::EnableLoopInterleaving(
351     "interleave-loops", cl::init(true), cl::Hidden,
352     cl::desc("Enable loop interleaving in Loop vectorization passes"));
353 cl::opt<bool> llvm::EnableLoopVectorization(
354     "vectorize-loops", cl::init(true), cl::Hidden,
355     cl::desc("Run the Loop vectorization passes"));
356 
357 /// A helper function that returns the type of loaded or stored value.
358 static Type *getMemInstValueType(Value *I) {
359   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
360          "Expected Load or Store instruction");
361   if (auto *LI = dyn_cast<LoadInst>(I))
362     return LI->getType();
363   return cast<StoreInst>(I)->getValueOperand()->getType();
364 }
365 
366 /// A helper function that returns true if the given type is irregular. The
367 /// type is irregular if its allocated size doesn't equal the store size of an
368 /// element of the corresponding vector type at the given vectorization factor.
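///
/// For example (a sketch, assuming a typical x86-64 data layout, where
/// x86_fp80 has a type size of 80 bits but an allocation size of 128 bits):
/// an array of x86_fp80 values has padding between elements, so x86_fp80 is
/// irregular even at VF = 1, and widened accesses cannot simply be bitcast to
/// the corresponding vector type. Types such as i32 or double, whose
/// allocation and store sizes match, are regular.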
369 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
370   // Determine if an array of VF elements of type Ty is "bitcast compatible"
371   // with a <VF x Ty> vector.
372   if (VF.isVector()) {
373     auto *VectorTy = VectorType::get(Ty, VF);
374     return TypeSize::get(VF.getKnownMinValue() *
375                              DL.getTypeAllocSize(Ty).getFixedValue(),
376                          VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
377   }
378 
379   // If the vectorization factor is one, we just check if an array of type Ty
380   // requires padding between elements.
381   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
382 }
383 
384 /// A helper function that returns the reciprocal of the block probability of
385 /// predicated blocks. If we return X, we are assuming the predicated block
386 /// will execute once for every X iterations of the loop header.
387 ///
388 /// TODO: We should use actual block probability here, if available. Currently,
389 ///       we always assume predicated blocks have a 50% chance of executing.
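///
/// For example, with the current return value of 2, an instruction in a
/// predicated block whose cost is 8 is charged only 8 / 2 == 4 towards the
/// per-iteration cost of the loop, reflecting the assumption that the block
/// executes on roughly every other iteration.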
390 static unsigned getReciprocalPredBlockProb() { return 2; }
391 
392 /// A helper function that adds a 'fast' flag to floating-point operations.
393 static Value *addFastMathFlag(Value *V) {
394   if (isa<FPMathOperator>(V))
395     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
396   return V;
397 }
398 
399 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
400   if (isa<FPMathOperator>(V))
401     cast<Instruction>(V)->setFastMathFlags(FMF);
402   return V;
403 }
404 
405 /// A helper function that returns an integer or floating-point constant with
406 /// value C.
407 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
408   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
409                            : ConstantFP::get(Ty, C);
410 }
411 
412 /// Returns "best known" trip count for the specified loop \p L as defined by
413 /// the following procedure:
414 ///   1) Returns exact trip count if it is known.
415 ///   2) Returns expected trip count according to profile data if any.
416 ///   3) Returns upper bound estimate if it is known.
417 ///   4) Returns None if all of the above failed.
418 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
419   // Check if exact trip count is known.
420   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
421     return ExpectedTC;
422 
423   // Check if there is an expected trip count available from profile data.
424   if (LoopVectorizeWithBlockFrequency)
425     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
426       return EstimatedTC;
427 
428   // Check if upper bound estimate is known.
429   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
430     return ExpectedTC;
431 
432   return None;
433 }
434 
435 namespace llvm {
436 
437 /// InnerLoopVectorizer vectorizes loops which contain only one basic
438 /// block to a specified vectorization factor (VF).
439 /// This class performs the widening of scalars into vectors, or multiple
440 /// scalars. This class also implements the following features:
441 /// * It inserts an epilogue loop for handling loops that don't have iteration
442 ///   counts that are known to be a multiple of the vectorization factor.
443 /// * It handles the code generation for reduction variables.
444 /// * Scalarization (implementation using scalars) of un-vectorizable
445 ///   instructions.
446 /// InnerLoopVectorizer does not perform any vectorization-legality
447 /// checks, and relies on the caller to check for the different legality
448 /// aspects. The InnerLoopVectorizer relies on the
449 /// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for the given vectorization factor.
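///
/// A minimal usage sketch (hypothetical variable names; in-tree callers reach
/// this class through LoopVectorizationPlanner, and the exact planner entry
/// point may differ between versions):
///
///   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE,
///                          ElementCount::getFixed(4), /*UnrollFactor=*/2,
///                          &LVL, &CM, BFI, PSI);
///   LVP.executePlan(LB, DT); // generate code for the selected VPlan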
451 class InnerLoopVectorizer {
452 public:
453   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
454                       LoopInfo *LI, DominatorTree *DT,
455                       const TargetLibraryInfo *TLI,
456                       const TargetTransformInfo *TTI, AssumptionCache *AC,
457                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
458                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
459                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
460                       ProfileSummaryInfo *PSI)
461       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
462         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
463         Builder(PSE.getSE()->getContext()),
464         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
465         BFI(BFI), PSI(PSI) {
466     // Query this against the original loop and save it here because the profile
467     // of the original loop header may change as the transformation happens.
468     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
469         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
470   }
471 
472   virtual ~InnerLoopVectorizer() = default;
473 
474   /// Create a new empty loop that will contain vectorized instructions later
475   /// on, while the old loop will be used as the scalar remainder. Control flow
476   /// is generated around the vectorized (and scalar epilogue) loops consisting
477   /// of various checks and bypasses. Return the pre-header block of the new
478   /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
480   /// handle the more complex control flow around the loops.
481   virtual BasicBlock *createVectorizedLoopSkeleton();
482 
483   /// Widen a single instruction within the innermost loop.
484   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
485                         VPTransformState &State);
486 
487   /// Widen a single call instruction within the innermost loop.
488   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
489                             VPTransformState &State);
490 
491   /// Widen a single select instruction within the innermost loop.
492   void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
493                               bool InvariantCond, VPTransformState &State);
494 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
496   void fixVectorizedLoop();
497 
498   // Return true if any runtime check is added.
499   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
500 
501   /// A type for vectorized values in the new loop. Each value from the
502   /// original loop, when vectorized, is represented by UF vector values in the
503   /// new unrolled loop, where UF is the unroll factor.
504   using VectorParts = SmallVector<Value *, 2>;
505 
506   /// Vectorize a single GetElementPtrInst based on information gathered and
507   /// decisions taken during planning.
508   void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
509                 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
510                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
511 
512   /// Vectorize a single PHINode in a block. This method handles the induction
513   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
514   /// arbitrary length vectors.
515   void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);
516 
517   /// A helper function to scalarize a single Instruction in the innermost loop.
518   /// Generates a sequence of scalar instances for each lane between \p MinLane
519   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
520   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
521   /// Instr's operands.
522   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
523                             const VPIteration &Instance, bool IfPredicateInstr,
524                             VPTransformState &State);
525 
526   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
527   /// is provided, the integer induction variable will first be truncated to
528   /// the corresponding type.
529   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
530 
531   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
532   /// vector or scalar value on-demand if one is not yet available. When
533   /// vectorizing a loop, we visit the definition of an instruction before its
534   /// uses. When visiting the definition, we either vectorize or scalarize the
535   /// instruction, creating an entry for it in the corresponding map. (In some
536   /// cases, such as induction variables, we will create both vector and scalar
537   /// entries.) Then, as we encounter uses of the definition, we derive values
538   /// for each scalar or vector use unless such a value is already available.
539   /// For example, if we scalarize a definition and one of its uses is vector,
540   /// we build the required vector on-demand with an insertelement sequence
541   /// when visiting the use. Otherwise, if the use is scalar, we can use the
542   /// existing scalar definition.
543   ///
544   /// Return a value in the new loop corresponding to \p V from the original
545   /// loop at unroll index \p Part. If the value has already been vectorized,
546   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
547   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
548   /// a new vector value on-demand by inserting the scalar values into a vector
549   /// with an insertelement sequence. If the value has been neither vectorized
550   /// nor scalarized, it must be loop invariant, so we simply broadcast the
551   /// value into a vector.
552   Value *getOrCreateVectorValue(Value *V, unsigned Part);
553 
554   void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
555     VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
556   }
557 
558   /// Return a value in the new loop corresponding to \p V from the original
559   /// loop at unroll and vector indices \p Instance. If the value has been
560   /// vectorized but not scalarized, the necessary extractelement instruction
561   /// will be generated.
562   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
563 
564   /// Construct the vector value of a scalarized value \p V one lane at a time.
565   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
566 
567   /// Try to vectorize interleaved access group \p Group with the base address
568   /// given in \p Addr, optionally masking the vector operations if \p
569   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
570   /// values in the vectorized loop.
571   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
572                                 VPTransformState &State, VPValue *Addr,
573                                 ArrayRef<VPValue *> StoredValues,
574                                 VPValue *BlockInMask = nullptr);
575 
576   /// Vectorize Load and Store instructions with the base address given in \p
577   /// Addr, optionally masking the vector operations if \p BlockInMask is
578   /// non-null. Use \p State to translate given VPValues to IR values in the
579   /// vectorized loop.
580   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
581                                   VPValue *Def, VPValue *Addr,
582                                   VPValue *StoredValue, VPValue *BlockInMask);
583 
584   /// Set the debug location in the builder using the debug location in
585   /// the instruction.
586   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
587 
588   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
589   void fixNonInductionPHIs(void);
590 
591 protected:
592   friend class LoopVectorizationPlanner;
593 
594   /// A small list of PHINodes.
595   using PhiVector = SmallVector<PHINode *, 4>;
596 
597   /// A type for scalarized values in the new loop. Each value from the
598   /// original loop, when scalarized, is represented by UF x VF scalar values
599   /// in the new unrolled loop, where UF is the unroll factor and VF is the
600   /// vectorization factor.
601   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
602 
603   /// Set up the values of the IVs correctly when exiting the vector loop.
604   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
605                     Value *CountRoundDown, Value *EndValue,
606                     BasicBlock *MiddleBlock);
607 
608   /// Create a new induction variable inside L.
609   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
610                                    Value *Step, Instruction *DL);
611 
612   /// Handle all cross-iteration phis in the header.
613   void fixCrossIterationPHIs();
614 
615   /// Fix a first-order recurrence. This is the second phase of vectorizing
616   /// this phi node.
617   void fixFirstOrderRecurrence(PHINode *Phi);
618 
619   /// Fix a reduction cross-iteration phi. This is the second phase of
620   /// vectorizing this phi node.
621   void fixReduction(PHINode *Phi);
622 
623   /// Clear NSW/NUW flags from reduction instructions if necessary.
624   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
625 
  /// The loop exit block may have single-value PHI nodes with some incoming
  /// value. While vectorizing, we only handled real values that were defined
  /// inside the loop, and we should have one value for each predecessor of its
  /// parent basic block. See PR14725.
630   void fixLCSSAPHIs();
631 
632   /// Iteratively sink the scalarized operands of a predicated instruction into
633   /// the block that was created for it.
634   void sinkScalarOperands(Instruction *PredInst);
635 
636   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
637   /// represented as.
638   void truncateToMinimalBitwidths();
639 
640   /// Create a broadcast instruction. This method generates a broadcast
641   /// instruction (shuffle) for loop invariant values and for the induction
642   /// value. If this is the induction variable then we extend it to N, N+1, ...
643   /// this is needed because each iteration in the loop corresponds to a SIMD
644   /// element.
645   virtual Value *getBroadcastInstrs(Value *V);
646 
647   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
649   /// \p Opcode is relevant for FP induction variable.
650   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
651                                Instruction::BinaryOps Opcode =
652                                Instruction::BinaryOpsEnd);
653 
654   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
655   /// variable on which to base the steps, \p Step is the size of the step, and
656   /// \p EntryVal is the value from the original loop that maps to the steps.
657   /// Note that \p EntryVal doesn't have to be an induction variable - it
658   /// can also be a truncate instruction.
659   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
660                         const InductionDescriptor &ID);
661 
662   /// Create a vector induction phi node based on an existing scalar one. \p
663   /// EntryVal is the value from the original loop that maps to the vector phi
664   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
665   /// truncate instruction, instead of widening the original IV, we widen a
666   /// version of the IV truncated to \p EntryVal's type.
667   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
668                                        Value *Step, Instruction *EntryVal);
669 
670   /// Returns true if an instruction \p I should be scalarized instead of
671   /// vectorized for the chosen vectorization factor.
672   bool shouldScalarizeInstruction(Instruction *I) const;
673 
674   /// Returns true if we should generate a scalar version of \p IV.
675   bool needsScalarInduction(Instruction *IV) const;
676 
677   /// If there is a cast involved in the induction variable \p ID, which should
678   /// be ignored in the vectorized loop body, this function records the
679   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
680   /// cast. We had already proved that the casted Phi is equal to the uncasted
681   /// Phi in the vectorized loop (under a runtime guard), and therefore
682   /// there is no need to vectorize the cast - the same value can be used in the
683   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified;
  /// otherwise, \p VectorLoopValue is a widened/vectorized value.
686   ///
687   /// \p EntryVal is the value from the original loop that maps to the vector
688   /// phi node and is used to distinguish what is the IV currently being
689   /// processed - original one (if \p EntryVal is a phi corresponding to the
690   /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
692   /// latter case \p EntryVal is a TruncInst and we must not record anything for
693   /// that IV, but it's error-prone to expect callers of this routine to care
694   /// about that, hence this explicit parameter.
695   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
696                                              const Instruction *EntryVal,
697                                              Value *VectorLoopValue,
698                                              unsigned Part,
699                                              unsigned Lane = UINT_MAX);
700 
701   /// Generate a shuffle sequence that will reverse the vector Vec.
702   virtual Value *reverseVector(Value *Vec);
703 
704   /// Returns (and creates if needed) the original loop trip count.
705   Value *getOrCreateTripCount(Loop *NewLoop);
706 
707   /// Returns (and creates if needed) the trip count of the widened loop.
708   Value *getOrCreateVectorTripCount(Loop *NewLoop);
709 
710   /// Returns a bitcasted value to the requested vector type.
711   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
712   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
713                                 const DataLayout &DL);
714 
715   /// Emit a bypass check to see if the vector trip count is zero, including if
716   /// it overflows.
717   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
718 
719   /// Emit a bypass check to see if all of the SCEV assumptions we've
720   /// had to make are correct.
721   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
722 
723   /// Emit bypass checks to check any memory assumptions we may have made.
724   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
725 
726   /// Compute the transformed value of Index at offset StartValue using step
727   /// StepValue.
728   /// For integer induction, returns StartValue + Index * StepValue.
729   /// For pointer induction, returns StartValue[Index * StepValue].
730   /// FIXME: The newly created binary instructions should contain nsw/nuw
731   /// flags, which can be found from the original scalar operations.
732   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
733                               const DataLayout &DL,
734                               const InductionDescriptor &ID) const;
735 
736   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
737   /// vector loop preheader, middle block and scalar preheader. Also
738   /// allocate a loop object for the new vector loop and return it.
739   Loop *createVectorLoopSkeleton(StringRef Prefix);
740 
741   /// Create new phi nodes for the induction variables to resume iteration count
742   /// in the scalar epilogue, from where the vectorized loop left off (given by
743   /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
745   /// vectorization) and the resume values can come from an additional bypass
746   /// block, the \p AdditionalBypass pair provides information about the bypass
747   /// block and the end value on the edge from bypass to this loop.
748   void createInductionResumeValues(
749       Loop *L, Value *VectorTripCount,
750       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
751 
752   /// Complete the loop skeleton by adding debug MDs, creating appropriate
753   /// conditional branches in the middle block, preparing the builder and
754   /// running the verifier. Take in the vector loop \p L as argument, and return
755   /// the preheader of the completed vector loop.
756   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
757 
758   /// Add additional metadata to \p To that was not present on \p Orig.
759   ///
760   /// Currently this is used to add the noalias annotations based on the
761   /// inserted memchecks.  Use this for instructions that are *cloned* into the
762   /// vector loop.
763   void addNewMetadata(Instruction *To, const Instruction *Orig);
764 
765   /// Add metadata from one instruction to another.
766   ///
767   /// This includes both the original MDs from \p From and additional ones (\see
768   /// addNewMetadata).  Use this for *newly created* instructions in the vector
769   /// loop.
770   void addMetadata(Instruction *To, Instruction *From);
771 
772   /// Similar to the previous function but it adds the metadata to a
773   /// vector of instructions.
774   void addMetadata(ArrayRef<Value *> To, Instruction *From);
775 
776   /// Allow subclasses to override and print debug traces before/after vplan
777   /// execution, when trace information is requested.
778   virtual void printDebugTracesAtStart(){};
779   virtual void printDebugTracesAtEnd(){};
780 
781   /// The original loop.
782   Loop *OrigLoop;
783 
784   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
785   /// dynamic knowledge to simplify SCEV expressions and converts them to a
786   /// more usable form.
787   PredicatedScalarEvolution &PSE;
788 
789   /// Loop Info.
790   LoopInfo *LI;
791 
792   /// Dominator Tree.
793   DominatorTree *DT;
794 
795   /// Alias Analysis.
796   AAResults *AA;
797 
798   /// Target Library Info.
799   const TargetLibraryInfo *TLI;
800 
801   /// Target Transform Info.
802   const TargetTransformInfo *TTI;
803 
804   /// Assumption Cache.
805   AssumptionCache *AC;
806 
807   /// Interface to emit optimization remarks.
808   OptimizationRemarkEmitter *ORE;
809 
810   /// LoopVersioning.  It's only set up (non-null) if memchecks were
811   /// used.
812   ///
813   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
815   std::unique_ptr<LoopVersioning> LVer;
816 
817   /// The vectorization SIMD factor to use. Each vector will have this many
818   /// vector elements.
819   ElementCount VF;
820 
821   /// The vectorization unroll factor to use. Each scalar is vectorized to this
822   /// many different vector instructions.
823   unsigned UF;
824 
825   /// The builder that we use
826   IRBuilder<> Builder;
827 
828   // --- Vectorization state ---
829 
830   /// The vector-loop preheader.
831   BasicBlock *LoopVectorPreHeader;
832 
833   /// The scalar-loop preheader.
834   BasicBlock *LoopScalarPreHeader;
835 
836   /// Middle Block between the vector and the scalar.
837   BasicBlock *LoopMiddleBlock;
838 
839   /// The ExitBlock of the scalar loop.
840   BasicBlock *LoopExitBlock;
841 
842   /// The vector loop body.
843   BasicBlock *LoopVectorBody;
844 
845   /// The scalar loop body.
846   BasicBlock *LoopScalarBody;
847 
848   /// A list of all bypass blocks. The first block is the entry of the loop.
849   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
850 
851   /// The new Induction variable which was added to the new block.
852   PHINode *Induction = nullptr;
853 
854   /// The induction variable of the old basic block.
855   PHINode *OldInduction = nullptr;
856 
857   /// Maps values from the original loop to their corresponding values in the
858   /// vectorized loop. A key value can map to either vector values, scalar
859   /// values or both kinds of values, depending on whether the key was
860   /// vectorized and scalarized.
861   VectorizerValueMap VectorLoopValueMap;
862 
863   /// Store instructions that were predicated.
864   SmallVector<Instruction *, 4> PredicatedInstructions;
865 
866   /// Trip count of the original loop.
867   Value *TripCount = nullptr;
868 
869   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
870   Value *VectorTripCount = nullptr;
871 
872   /// The legality analysis.
873   LoopVectorizationLegality *Legal;
874 
  /// The profitability analysis.
876   LoopVectorizationCostModel *Cost;
877 
878   // Record whether runtime checks are added.
879   bool AddedSafetyChecks = false;
880 
  // Holds the end values for each induction variable. We save the end values
  // so we can later fix up the external users of the induction variables.
883   DenseMap<PHINode *, Value *> IVEndValues;
884 
885   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
886   // fixed up at the end of vector code generation.
887   SmallVector<PHINode *, 8> OrigPHIsToFix;
888 
889   /// BFI and PSI are used to check for profile guided size optimizations.
890   BlockFrequencyInfo *BFI;
891   ProfileSummaryInfo *PSI;
892 
  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
895   bool OptForSizeBasedOnProfile;
896 };
897 
898 class InnerLoopUnroller : public InnerLoopVectorizer {
899 public:
900   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
901                     LoopInfo *LI, DominatorTree *DT,
902                     const TargetLibraryInfo *TLI,
903                     const TargetTransformInfo *TTI, AssumptionCache *AC,
904                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
905                     LoopVectorizationLegality *LVL,
906                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
907                     ProfileSummaryInfo *PSI)
908       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
909                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
910                             BFI, PSI) {}
911 
912 private:
913   Value *getBroadcastInstrs(Value *V) override;
914   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
915                        Instruction::BinaryOps Opcode =
916                        Instruction::BinaryOpsEnd) override;
917   Value *reverseVector(Value *Vec) override;
918 };
919 
920 /// Encapsulate information regarding vectorization of a loop and its epilogue.
921 /// This information is meant to be updated and used across two stages of
922 /// epilogue vectorization.
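/// For example (an illustrative sketch), vectorizing the main loop with
/// VF = 8, UF = 2 and the epilogue with VF = 4, UF = 1 would be described by
///
///   EpilogueLoopVectorizationInfo EPI(/*MVF=*/8, /*MUF=*/2, /*EVF=*/4,
///                                     /*EUF=*/1);
///
/// with the remaining members filled in while the two-pass skeleton is built.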
923 struct EpilogueLoopVectorizationInfo {
924   ElementCount MainLoopVF = ElementCount::getFixed(0);
925   unsigned MainLoopUF = 0;
926   ElementCount EpilogueVF = ElementCount::getFixed(0);
927   unsigned EpilogueUF = 0;
928   BasicBlock *MainLoopIterationCountCheck = nullptr;
929   BasicBlock *EpilogueIterationCountCheck = nullptr;
930   BasicBlock *SCEVSafetyCheck = nullptr;
931   BasicBlock *MemSafetyCheck = nullptr;
932   Value *TripCount = nullptr;
933   Value *VectorTripCount = nullptr;
934 
935   EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
936                                 unsigned EUF)
937       : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
938         EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
939     assert(EUF == 1 &&
940            "A high UF for the epilogue loop is likely not beneficial.");
941   }
942 };
943 
944 /// An extension of the inner loop vectorizer that creates a skeleton for a
945 /// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the VPlan on a given loop twice: first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue.  This is achieved by
949 /// deriving two concrete strategy classes from this base class and invoking
950 /// them in succession from the loop vectorizer planner.
951 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
952 public:
953   InnerLoopAndEpilogueVectorizer(
954       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
955       DominatorTree *DT, const TargetLibraryInfo *TLI,
956       const TargetTransformInfo *TTI, AssumptionCache *AC,
957       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
958       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
959       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
960       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
961                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
962         EPI(EPI) {}
963 
964   // Override this function to handle the more complex control flow around the
965   // three loops.
966   BasicBlock *createVectorizedLoopSkeleton() final override {
967     return createEpilogueVectorizedLoopSkeleton();
968   }
969 
970   /// The interface for creating a vectorized skeleton using one of two
971   /// different strategies, each corresponding to one execution of the vplan
972   /// as described above.
973   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
974 
975   /// Holds and updates state information required to vectorize the main loop
976   /// and its epilogue in two separate passes. This setup helps us avoid
977   /// regenerating and recomputing runtime safety checks. It also helps us to
978   /// shorten the iteration-count-check path length for the cases where the
979   /// iteration count of the loop is so small that the main vector loop is
980   /// completely skipped.
981   EpilogueLoopVectorizationInfo &EPI;
982 };
983 
984 /// A specialized derived class of inner loop vectorizer that performs
985 /// vectorization of *main* loops in the process of vectorizing loops and their
986 /// epilogues.
987 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
988 public:
989   EpilogueVectorizerMainLoop(
990       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
991       DominatorTree *DT, const TargetLibraryInfo *TLI,
992       const TargetTransformInfo *TTI, AssumptionCache *AC,
993       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
994       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
995       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
996       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
997                                        EPI, LVL, CM, BFI, PSI) {}
998   /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of VPlan execution).
1000   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1001 
1002 protected:
1003   /// Emits an iteration count bypass check once for the main loop (when \p
1004   /// ForEpilogue is false) and once for the epilogue loop (when \p
1005   /// ForEpilogue is true).
1006   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
1007                                              bool ForEpilogue);
1008   void printDebugTracesAtStart() override;
1009   void printDebugTracesAtEnd() override;
1010 };
1011 
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
1015 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
1016 public:
1017   EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
1018                     LoopInfo *LI, DominatorTree *DT,
1019                     const TargetLibraryInfo *TLI,
1020                     const TargetTransformInfo *TTI, AssumptionCache *AC,
1021                     OptimizationRemarkEmitter *ORE,
1022                     EpilogueLoopVectorizationInfo &EPI,
1023                     LoopVectorizationLegality *LVL,
1024                     llvm::LoopVectorizationCostModel *CM,
1025                     BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
1026       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1027                                        EPI, LVL, CM, BFI, PSI) {}
1028   /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
1030   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1031 
1032 protected:
1033   /// Emits an iteration count bypass check after the main vector loop has
1034   /// finished to see if there are any iterations left to execute by either
1035   /// the vector epilogue or the scalar epilogue.
1036   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1037                                                       BasicBlock *Bypass,
1038                                                       BasicBlock *Insert);
1039   void printDebugTracesAtStart() override;
1040   void printDebugTracesAtEnd() override;
1041 };
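// A sketch of how the two strategies above cooperate (simplified, with
// hypothetical variable names; the in-tree driver lives in the loop-vectorize
// pass itself, and the exact planner entry point may differ between versions):
//
//   EpilogueLoopVectorizationInfo EPI(MainVF, MainUF, EpilogueVF, /*EUF=*/1);
//   EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI,
//                                      &LVL, &CM, BFI, PSI);
//   LVP.executePlan(MainILV, DT);   // first pass: the main vector loop
//   EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
//                                            EPI, &LVL, &CM, BFI, PSI);
//   LVP.executePlan(EpilogILV, DT); // second pass: the vectorized epilogue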
1042 } // end namespace llvm
1043 
/// Look for a meaningful debug location on the instruction or its
1045 /// operands.
1046 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1047   if (!I)
1048     return I;
1049 
1050   DebugLoc Empty;
1051   if (I->getDebugLoc() != Empty)
1052     return I;
1053 
1054   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
1055     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
1056       if (OpInst->getDebugLoc() != Empty)
1057         return OpInst;
1058   }
1059 
1060   return I;
1061 }
1062 
1063 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
1064   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1065     const DILocation *DIL = Inst->getDebugLoc();
1066     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1067         !isa<DbgInfoIntrinsic>(Inst)) {
1068       assert(!VF.isScalable() && "scalable vectors not yet supported.");
1069       auto NewDIL =
1070           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1071       if (NewDIL)
1072         B.SetCurrentDebugLocation(NewDIL.getValue());
1073       else
1074         LLVM_DEBUG(dbgs()
1075                    << "Failed to create new discriminator: "
1076                    << DIL->getFilename() << " Line: " << DIL->getLine());
1077     }
1078     else
1079       B.SetCurrentDebugLocation(DIL);
1080   } else
1081     B.SetCurrentDebugLocation(DebugLoc());
1082 }
1083 
1084 /// Write a record \p DebugMsg about vectorization failure to the debug
1085 /// output stream. If \p I is passed, it is an instruction that prevents
1086 /// vectorization.
1087 #ifndef NDEBUG
1088 static void debugVectorizationFailure(const StringRef DebugMsg,
1089     Instruction *I) {
1090   dbgs() << "LV: Not vectorizing: " << DebugMsg;
1091   if (I != nullptr)
1092     dbgs() << " " << *I;
1093   else
1094     dbgs() << '.';
1095   dbgs() << '\n';
1096 }
1097 #endif
1098 
1099 /// Create an analysis remark that explains why vectorization failed
1100 ///
1101 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1102 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1103 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1104 /// the location of the remark.  \return the remark object that can be
1105 /// streamed to.
1106 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1107     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1108   Value *CodeRegion = TheLoop->getHeader();
1109   DebugLoc DL = TheLoop->getStartLoc();
1110 
1111   if (I) {
1112     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
1115     if (I->getDebugLoc())
1116       DL = I->getDebugLoc();
1117   }
1118 
1119   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
1120   R << "loop not vectorized: ";
1121   return R;
1122 }
1123 
1124 /// Return a value for Step multiplied by VF.
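/// For example, for a step of 1 and a fixed VF of 4 this simply returns the
/// constant 4; for a scalable VF with a known minimum of 4 it returns
/// 4 * vscale, materialized via the llvm.vscale intrinsic.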
1125 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1126   assert(isa<ConstantInt>(Step) && "Expected an integer step");
1127   Constant *StepVal = ConstantInt::get(
1128       Step->getType(),
1129       cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1130   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1131 }
1132 
1133 namespace llvm {
1134 
1135 void reportVectorizationFailure(const StringRef DebugMsg,
1136     const StringRef OREMsg, const StringRef ORETag,
1137     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
1138   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
1139   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1140   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
1141                 ORETag, TheLoop, I) << OREMsg);
1142 }
1143 
1144 } // end namespace llvm
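// For example (hypothetical message strings; the tag is used as the remark
// identifier), a typical caller looks like:
//
//   reportVectorizationFailure(
//       "Unsupported loop structure", "unsupported loop structure",
//       "UnsupportedLoopStructure", ORE, TheLoop, /*I=*/nullptr);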
1145 
1146 #ifndef NDEBUG
1147 /// \return string containing a file name and a line # for the given loop.
1148 static std::string getDebugLocString(const Loop *L) {
1149   std::string Result;
1150   if (L) {
1151     raw_string_ostream OS(Result);
1152     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1153       LoopDbgLoc.print(OS);
1154     else
1155       // Just print the module name.
1156       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1157     OS.flush();
1158   }
1159   return Result;
1160 }
1161 #endif
1162 
1163 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1164                                          const Instruction *Orig) {
1165   // If the loop was versioned with memchecks, add the corresponding no-alias
1166   // metadata.
1167   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1168     LVer->annotateInstWithNoAlias(To, Orig);
1169 }
1170 
1171 void InnerLoopVectorizer::addMetadata(Instruction *To,
1172                                       Instruction *From) {
1173   propagateMetadata(To, From);
1174   addNewMetadata(To, From);
1175 }
1176 
1177 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1178                                       Instruction *From) {
1179   for (Value *V : To) {
1180     if (Instruction *I = dyn_cast<Instruction>(V))
1181       addMetadata(I, From);
1182   }
1183 }
1184 
1185 namespace llvm {
1186 
// Hints telling the loop vectorization cost model how the scalar epilogue loop
// should be lowered.
1189 enum ScalarEpilogueLowering {
1190 
1191   // The default: allowing scalar epilogues.
1192   CM_ScalarEpilogueAllowed,
1193 
1194   // Vectorization with OptForSize: don't allow epilogues.
1195   CM_ScalarEpilogueNotAllowedOptSize,
1196 
  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
1201   CM_ScalarEpilogueNotAllowedLowTripLoop,
1202 
1203   // Loop hint predicate indicating an epilogue is undesired.
1204   CM_ScalarEpilogueNotNeededUsePredicate,
1205 
  // Directive indicating we must either tail fold or not vectorize.
1207   CM_ScalarEpilogueNotAllowedUsePredicate
1208 };
1209 
1210 /// LoopVectorizationCostModel - estimates the expected speedups due to
1211 /// vectorization.
1212 /// In many cases vectorization is not profitable. This can happen because of
1213 /// a number of reasons. In this class we mainly attempt to predict the
1214 /// expected speedup/slowdowns due to the supported instruction set. We use the
1215 /// TargetTransformInfo to query the different backends for the cost of
1216 /// different operations.
1217 class LoopVectorizationCostModel {
1218 public:
1219   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1220                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1221                              LoopVectorizationLegality *Legal,
1222                              const TargetTransformInfo &TTI,
1223                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1224                              AssumptionCache *AC,
1225                              OptimizationRemarkEmitter *ORE, const Function *F,
1226                              const LoopVectorizeHints *Hints,
1227                              InterleavedAccessInfo &IAI)
1228       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1229         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1230         Hints(Hints), InterleaveInfo(IAI) {}
1231 
1232   /// \return An upper bound for the vectorization factor, or None if
1233   /// vectorization and interleaving should be avoided up front.
1234   Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1235 
1236   /// \return True if runtime checks are required for vectorization, and false
1237   /// otherwise.
1238   bool runtimeChecksRequired();
1239 
1240   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero,
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
1244   VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1245   VectorizationFactor
1246   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1247                                     const LoopVectorizationPlanner &LVP);
1248 
1249   /// Setup cost-based decisions for user vectorization factor.
1250   void selectUserVectorizationFactor(ElementCount UserVF) {
1251     collectUniformsAndScalars(UserVF);
1252     collectInstsToScalarize(UserVF);
1253   }
1254 
1255   /// \return The size (in bits) of the smallest and widest types in the code
1256   /// that needs to be vectorized. We ignore values that remain scalar such as
1257   /// 64 bit loop indices.
1258   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1259 
1260   /// \return The desired interleave count.
1261   /// If interleave count has been specified by metadata it will be returned.
1262   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1263   /// are the selected vectorization factor and the cost of the selected VF.
1264   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1265 
  /// A memory access instruction may be vectorized in more than one way. The
  /// form of the instruction after vectorization depends on its cost. This
  /// function takes cost-based decisions for Load/Store instructions and
  /// collects them in a map. This decision map is used for building the lists
  /// of loop-uniform and loop-scalar instructions. The calculated cost is
  /// saved with the widening decision in order to avoid redundant
  /// calculations.
1273   void setCostBasedWideningDecision(ElementCount VF);
1274 
1275   /// A struct that represents some properties of the register usage
1276   /// of a loop.
1277   struct RegisterUsage {
1278     /// Holds the number of loop invariant values that are used in the loop.
1279     /// The key is ClassID of target-provided register class.
1280     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1281     /// Holds the maximum number of concurrent live intervals in the loop.
1282     /// The key is ClassID of target-provided register class.
1283     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1284   };
1285 
1286   /// \return Returns information about the register usages of the loop for the
1287   /// given vectorization factors.
1288   SmallVector<RegisterUsage, 8>
1289   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1290 
1291   /// Collect values we want to ignore in the cost model.
1292   void collectValuesToIgnore();
1293 
1294   /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1296   void collectInLoopReductions();
1297 
1298   /// \returns The smallest bitwidth each instruction can be represented with.
1299   /// The vector equivalents of these instructions should be truncated to this
1300   /// type.
1301   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1302     return MinBWs;
1303   }
1304 
1305   /// \returns True if it is more profitable to scalarize instruction \p I for
1306   /// vectorization factor \p VF.
1307   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1308     assert(VF.isVector() &&
1309            "Profitable to scalarize relevant only for VF > 1.");
1310 
1311     // Cost model is not run in the VPlan-native path - return conservative
1312     // result until this changes.
1313     if (EnableVPlanNativePath)
1314       return false;
1315 
1316     auto Scalars = InstsToScalarize.find(VF);
1317     assert(Scalars != InstsToScalarize.end() &&
1318            "VF not yet analyzed for scalarization profitability");
1319     return Scalars->second.find(I) != Scalars->second.end();
1320   }
1321 
1322   /// Returns true if \p I is known to be uniform after vectorization.
1323   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1324     if (VF.isScalar())
1325       return true;
1326 
1327     // Cost model is not run in the VPlan-native path - return conservative
1328     // result until this changes.
1329     if (EnableVPlanNativePath)
1330       return false;
1331 
1332     auto UniformsPerVF = Uniforms.find(VF);
1333     assert(UniformsPerVF != Uniforms.end() &&
1334            "VF not yet analyzed for uniformity");
1335     return UniformsPerVF->second.count(I);
1336   }
1337 
1338   /// Returns true if \p I is known to be scalar after vectorization.
1339   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1340     if (VF.isScalar())
1341       return true;
1342 
1343     // Cost model is not run in the VPlan-native path - return conservative
1344     // result until this changes.
1345     if (EnableVPlanNativePath)
1346       return false;
1347 
1348     auto ScalarsPerVF = Scalars.find(VF);
1349     assert(ScalarsPerVF != Scalars.end() &&
1350            "Scalar values are not calculated for VF");
1351     return ScalarsPerVF->second.count(I);
1352   }
1353 
1354   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1355   /// for vectorization factor \p VF.
1356   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1357     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1358            !isProfitableToScalarize(I, VF) &&
1359            !isScalarAfterVectorization(I, VF);
1360   }
1361 
1362   /// Decision that was taken during cost calculation for memory instruction.
1363   enum InstWidening {
1364     CM_Unknown,
1365     CM_Widen,         // For consecutive accesses with stride +1.
1366     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1367     CM_Interleave,
1368     CM_GatherScatter,
1369     CM_Scalarize
1370   };
1371 
1372   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1373   /// instruction \p I and vector width \p VF.
1374   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1375                            unsigned Cost) {
1376     assert(VF.isVector() && "Expected VF >=2");
1377     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1378   }
1379 
1380   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1381   /// interleaving group \p Grp and vector width \p VF.
1382   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1383                            ElementCount VF, InstWidening W, unsigned Cost) {
1384     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1387     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1388       if (auto *I = Grp->getMember(i)) {
1389         if (Grp->getInsertPos() == I)
1390           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1391         else
1392           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1393       }
1394     }
1395   }
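
  // For example (illustrative only): for an interleave group {A, B} whose
  // insert position is B and whose computed cost for this VF is 6, the entries
  // recorded above are (A, W, 0) and (B, W, 6), so summing the per-instruction
  // costs over the group counts the group cost exactly once.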
1396 
1397   /// Return the cost model decision for the given instruction \p I and vector
1398   /// width \p VF. Return CM_Unknown if this instruction did not pass
1399   /// through the cost modeling.
1400   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1401     assert(VF.isVector() && "Expected VF to be a vector VF");
1402     // Cost model is not run in the VPlan-native path - return conservative
1403     // result until this changes.
1404     if (EnableVPlanNativePath)
1405       return CM_GatherScatter;
1406 
1407     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1408     auto Itr = WideningDecisions.find(InstOnVF);
1409     if (Itr == WideningDecisions.end())
1410       return CM_Unknown;
1411     return Itr->second.first;
1412   }
1413 
1414   /// Return the vectorization cost for the given instruction \p I and vector
1415   /// width \p VF.
1416   unsigned getWideningCost(Instruction *I, ElementCount VF) {
1417     assert(VF.isVector() && "Expected VF >=2");
1418     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1419     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1420            "The cost is not calculated");
1421     return WideningDecisions[InstOnVF].second;
1422   }
1423 
1424   /// Return True if instruction \p I is an optimizable truncate whose operand
1425   /// is an induction variable. Such a truncate will be removed by adding a new
1426   /// induction variable with the destination type.
1427   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1428     // If the instruction is not a truncate, return false.
1429     auto *Trunc = dyn_cast<TruncInst>(I);
1430     if (!Trunc)
1431       return false;
1432 
1433     // Get the source and destination types of the truncate.
1434     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1435     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1436 
1437     // If the truncate is free for the given types, return false. Replacing a
1438     // free truncate with an induction variable would add an induction variable
1439     // update instruction to each iteration of the loop. We exclude from this
1440     // check the primary induction variable since it will need an update
1441     // instruction regardless.
1442     Value *Op = Trunc->getOperand(0);
1443     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1444       return false;
1445 
1446     // If the truncated value is not an induction variable, return false.
1447     return Legal->isInductionPhi(Op);
1448   }
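
  // Sketch of the pattern this targets (illustrative IR, not from a test):
  //
  //   %iv    = phi i64 [ 0, %preheader ], [ %iv.next, %loop ]
  //   %trunc = trunc i64 %iv to i32   ; optimizable IV truncate
  //
  // Rather than widening the i64 IV and truncating every element, the
  // vectorizer can introduce a new i32 induction with the destination type,
  // provided the truncate is not already free for the target.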
1449 
1450   /// Collects the instructions to scalarize for each predicated instruction in
1451   /// the loop.
1452   void collectInstsToScalarize(ElementCount VF);
1453 
1454   /// Collect Uniform and Scalar values for the given \p VF.
1455   /// The sets depend on CM decision for Load/Store instructions
1456   /// that may be vectorized as interleave, gather-scatter or scalarized.
1457   void collectUniformsAndScalars(ElementCount VF) {
1458     // Do the analysis once.
1459     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1460       return;
1461     setCostBasedWideningDecision(VF);
1462     collectLoopUniforms(VF);
1463     collectLoopScalars(VF);
1464   }
1465 
1466   /// Returns true if the target machine supports masked store operation
1467   /// for the given \p DataType and kind of access to \p Ptr.
1468   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1469     return Legal->isConsecutivePtr(Ptr) &&
1470            TTI.isLegalMaskedStore(DataType, Alignment);
1471   }
1472 
1473   /// Returns true if the target machine supports masked load operation
1474   /// for the given \p DataType and kind of access to \p Ptr.
1475   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1476     return Legal->isConsecutivePtr(Ptr) &&
1477            TTI.isLegalMaskedLoad(DataType, Alignment);
1478   }
1479 
1480   /// Returns true if the target machine supports masked scatter operation
1481   /// for the given \p DataType.
1482   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1483     return TTI.isLegalMaskedScatter(DataType, Alignment);
1484   }
1485 
1486   /// Returns true if the target machine supports masked gather operation
1487   /// for the given \p DataType.
1488   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1489     return TTI.isLegalMaskedGather(DataType, Alignment);
1490   }
1491 
1492   /// Returns true if the target machine can represent \p V as a masked gather
1493   /// or scatter operation.
1494   bool isLegalGatherOrScatter(Value *V) {
1495     bool LI = isa<LoadInst>(V);
1496     bool SI = isa<StoreInst>(V);
1497     if (!LI && !SI)
1498       return false;
1499     auto *Ty = getMemInstValueType(V);
1500     Align Align = getLoadStoreAlignment(V);
1501     return (LI && isLegalMaskedGather(Ty, Align)) ||
1502            (SI && isLegalMaskedScatter(Ty, Align));
1503   }
1504 
1505   /// Returns true if \p I is an instruction that will be scalarized with
1506   /// predication. Such instructions include conditional stores and
1507   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1510   bool isScalarWithPredication(Instruction *I,
1511                                ElementCount VF = ElementCount::getFixed(1));
1512 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// Superset of instructions that return true for isScalarWithPredication.
1516   bool isPredicatedInst(Instruction *I) {
1517     if (!blockNeedsPredication(I->getParent()))
1518       return false;
1519     // Loads and stores that need some form of masked operation are predicated
1520     // instructions.
1521     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1522       return Legal->isMaskRequired(I);
1523     return isScalarWithPredication(I);
1524   }
1525 
1526   /// Returns true if \p I is a memory instruction with consecutive memory
1527   /// access that can be widened.
1528   bool
1529   memoryInstructionCanBeWidened(Instruction *I,
1530                                 ElementCount VF = ElementCount::getFixed(1));
1531 
1532   /// Returns true if \p I is a memory instruction in an interleaved-group
1533   /// of memory accesses that can be vectorized with wide vector loads/stores
1534   /// and shuffles.
1535   bool
1536   interleavedAccessCanBeWidened(Instruction *I,
1537                                 ElementCount VF = ElementCount::getFixed(1));
1538 
1539   /// Check if \p Instr belongs to any interleaved access group.
1540   bool isAccessInterleaved(Instruction *Instr) {
1541     return InterleaveInfo.isInterleaved(Instr);
1542   }
1543 
1544   /// Get the interleaved access group that \p Instr belongs to.
1545   const InterleaveGroup<Instruction> *
1546   getInterleavedAccessGroup(Instruction *Instr) {
1547     return InterleaveInfo.getInterleaveGroup(Instr);
1548   }
1549 
1550   /// Returns true if an interleaved group requires a scalar iteration
1551   /// to handle accesses with gaps, and there is nothing preventing us from
1552   /// creating a scalar epilogue.
1553   bool requiresScalarEpilogue() const {
1554     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1555   }
1556 
  /// Returns true if a scalar epilogue is allowed, i.e. it is not prohibited
  /// by optsize or by a loop hint annotation.
1559   bool isScalarEpilogueAllowed() const {
1560     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1561   }
1562 
  /// Returns true if all loop blocks should be masked to fold the tail loop.
1564   bool foldTailByMasking() const { return FoldTailByMasking; }
1565 
1566   bool blockNeedsPredication(BasicBlock *BB) {
1567     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1568   }
1569 
1570   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1571   /// nodes to the chain of instructions representing the reductions. Uses a
1572   /// MapVector to ensure deterministic iteration order.
1573   using ReductionChainMap =
1574       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1575 
1576   /// Return the chain of instructions representing an inloop reduction.
1577   const ReductionChainMap &getInLoopReductionChains() const {
1578     return InLoopReductionChains;
1579   }
1580 
1581   /// Returns true if the Phi is part of an inloop reduction.
1582   bool isInLoopReduction(PHINode *Phi) const {
1583     return InLoopReductionChains.count(Phi);
1584   }
1585 
1586   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1587   /// with factor VF.  Return the cost of the instruction, including
1588   /// scalarization overhead if it's needed.
1589   unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1590 
1591   /// Estimate cost of a call instruction CI if it were vectorized with factor
1592   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1596   unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1597                              bool &NeedToScalarize);
1598 
1599   /// Invalidates decisions already taken by the cost model.
1600   void invalidateCostModelingDecisions() {
1601     WideningDecisions.clear();
1602     Uniforms.clear();
1603     Scalars.clear();
1604   }
1605 
1606 private:
1607   unsigned NumPredStores = 0;
1608 
1609   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1610   /// than zero. One is returned if vectorization should best be avoided due
1611   /// to cost.
1612   ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1613                                     ElementCount UserVF);
1614 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1622   using VectorizationCostTy = std::pair<unsigned, bool>;
1623 
1624   /// Returns the expected execution cost. The unit of the cost does
1625   /// not matter because we use the 'cost' units to compare different
1626   /// vector widths. The cost that is returned is *not* normalized by
1627   /// the factor width.
1628   VectorizationCostTy expectedCost(ElementCount VF);
1629 
1630   /// Returns the execution time cost of an instruction for a given vector
1631   /// width. Vector width of one means scalar.
1632   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1633 
1634   /// The cost-computation logic from getInstructionCost which provides
1635   /// the vector type as an output parameter.
1636   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1637 
1638   /// Calculate vectorization cost of memory instruction \p I.
1639   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1640 
1641   /// The cost computation for scalarized memory instruction.
1642   unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1643 
1644   /// The cost computation for interleaving group of memory instructions.
1645   unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1646 
1647   /// The cost computation for Gather/Scatter instruction.
1648   unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1649 
1650   /// The cost computation for widening instruction \p I with consecutive
1651   /// memory access.
1652   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1653 
1654   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1655   /// Load: scalar load + broadcast.
1656   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1657   /// element)
1658   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1659 
1660   /// Estimate the overhead of scalarizing an instruction. This is a
1661   /// convenience wrapper for the type-based getScalarizationOverhead API.
1662   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1663 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1666   bool isConsecutiveLoadOrStore(Instruction *I);
1667 
1668   /// Returns true if an artificially high cost for emulated masked memrefs
1669   /// should be used.
1670   bool useEmulatedMaskMemRefHack(Instruction *I);
1671 
1672   /// Map of scalar integer values to the smallest bitwidth they can be legally
1673   /// represented as. The vector equivalents of these values should be truncated
1674   /// to this type.
1675   MapVector<Instruction *, uint64_t> MinBWs;
1676 
1677   /// A type representing the costs for instructions if they were to be
1678   /// scalarized rather than vectorized. The entries are Instruction-Cost
1679   /// pairs.
1680   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1681 
1682   /// A set containing all BasicBlocks that are known to present after
1683   /// vectorization as a predicated block.
1684   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1685 
1686   /// Records whether it is allowed to have the original scalar loop execute at
1687   /// least once. This may be needed as a fallback loop in case runtime
1688   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
1690   /// or as a peel-loop to handle gaps in interleave-groups.
1691   /// Under optsize and when the trip count is very small we don't allow any
1692   /// iterations to execute in the scalar loop.
1693   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1694 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1696   bool FoldTailByMasking = false;
1697 
1698   /// A map holding scalar costs for different vectorization factors. The
1699   /// presence of a cost for an instruction in the mapping indicates that the
1700   /// instruction will be scalarized when vectorizing with the associated
1701   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1702   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1703 
1704   /// Holds the instructions known to be uniform after vectorization.
1705   /// The data is collected per VF.
1706   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1707 
1708   /// Holds the instructions known to be scalar after vectorization.
1709   /// The data is collected per VF.
1710   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1711 
1712   /// Holds the instructions (address computations) that are forced to be
1713   /// scalarized.
1714   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1715 
1716   /// PHINodes of the reductions that should be expanded in-loop along with
1717   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1719   ReductionChainMap InLoopReductionChains;
1720 
1721   /// Returns the expected difference in cost from scalarizing the expression
1722   /// feeding a predicated instruction \p PredInst. The instructions to
1723   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1724   /// non-negative return value implies the expression will be scalarized.
1725   /// Currently, only single-use chains are considered for scalarization.
1726   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1727                               ElementCount VF);
1728 
1729   /// Collect the instructions that are uniform after vectorization. An
1730   /// instruction is uniform if we represent it with a single scalar value in
1731   /// the vectorized loop corresponding to each vector iteration. Examples of
1732   /// uniform instructions include pointer operands of consecutive or
1733   /// interleaved memory accesses. Note that although uniformity implies an
1734   /// instruction will be scalar, the reverse is not true. In general, a
1735   /// scalarized instruction will be represented by VF scalar values in the
1736   /// vectorized loop, each corresponding to an iteration of the original
1737   /// scalar loop.
1738   void collectLoopUniforms(ElementCount VF);
1739 
1740   /// Collect the instructions that are scalar after vectorization. An
1741   /// instruction is scalar if it is known to be uniform or will be scalarized
1742   /// during vectorization. Non-uniform scalarized instructions will be
1743   /// represented by VF values in the vectorized loop, each corresponding to an
1744   /// iteration of the original scalar loop.
1745   void collectLoopScalars(ElementCount VF);
1746 
1747   /// Keeps cost model vectorization decision and cost for instructions.
1748   /// Right now it is used for memory instructions only.
1749   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1750                                 std::pair<InstWidening, unsigned>>;
1751 
1752   DecisionList WideningDecisions;
1753 
1754   /// Returns true if \p V is expected to be vectorized and it needs to be
1755   /// extracted.
1756   bool needsExtract(Value *V, ElementCount VF) const {
1757     Instruction *I = dyn_cast<Instruction>(V);
1758     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1759         TheLoop->isLoopInvariant(I))
1760       return false;
1761 
1762     // Assume we can vectorize V (and hence we need extraction) if the
1763     // scalars are not computed yet. This can happen, because it is called
1764     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1765     // the scalars are collected. That should be a safe assumption in most
1766     // cases, because we check if the operands have vectorizable types
1767     // beforehand in LoopVectorizationLegality.
1768     return Scalars.find(VF) == Scalars.end() ||
1769            !isScalarAfterVectorization(I, VF);
1770   };
1771 
1772   /// Returns a range containing only operands needing to be extracted.
1773   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1774                                                    ElementCount VF) {
1775     return SmallVector<Value *, 4>(make_filter_range(
1776         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1777   }
1778 
1779   /// Determines if we have the infrastructure to vectorize loop \p L and its
1780   /// epilogue, assuming the main loop is vectorized by \p VF.
1781   bool isCandidateForEpilogueVectorization(const Loop &L,
1782                                            const ElementCount VF) const;
1783 
1784   /// Returns true if epilogue vectorization is considered profitable, and
1785   /// false otherwise.
1786   /// \p VF is the vectorization factor chosen for the original loop.
1787   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1788 
1789 public:
1790   /// The loop that we evaluate.
1791   Loop *TheLoop;
1792 
1793   /// Predicated scalar evolution analysis.
1794   PredicatedScalarEvolution &PSE;
1795 
1796   /// Loop Info analysis.
1797   LoopInfo *LI;
1798 
1799   /// Vectorization legality.
1800   LoopVectorizationLegality *Legal;
1801 
1802   /// Vector target information.
1803   const TargetTransformInfo &TTI;
1804 
1805   /// Target Library Info.
1806   const TargetLibraryInfo *TLI;
1807 
1808   /// Demanded bits analysis.
1809   DemandedBits *DB;
1810 
1811   /// Assumption cache.
1812   AssumptionCache *AC;
1813 
1814   /// Interface to emit optimization remarks.
1815   OptimizationRemarkEmitter *ORE;
1816 
1817   const Function *TheFunction;
1818 
1819   /// Loop Vectorize Hint.
1820   const LoopVectorizeHints *Hints;
1821 
1822   /// The interleave access information contains groups of interleaved accesses
1823   /// with the same stride and close to each other.
1824   InterleavedAccessInfo &InterleaveInfo;
1825 
1826   /// Values to ignore in the cost model.
1827   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1828 
1829   /// Values to ignore in the cost model when VF > 1.
1830   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1831 
1832   /// Profitable vector factors.
1833   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1834 };
1835 
1836 } // end namespace llvm
1837 
1838 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
1841 // vector length information is not provided, vectorization is not considered
1842 // explicit. Interleave hints are not allowed either. These limitations will be
1843 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang loop
// vectorize' semantics. This pragma provides *auto-vectorization hints*
1846 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1847 // provides *explicit vectorization hints* (LV can bypass legal checks and
1848 // assume that vectorization is legal). However, both hints are implemented
1849 // using the same metadata (llvm.loop.vectorize, processed by
1850 // LoopVectorizeHints). This will be fixed in the future when the native IR
1851 // representation for pragma 'omp simd' is introduced.
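//
// For example, an outer loop annotated at source level roughly as
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)     // outer loop, explicitly annotated
//     for (int j = 0; j < M; ++j)   // inner loop
//       ...
//
// (or the equivalent '#pragma omp simd simdlen(4)') carries the required
// vector-length information, whereas a hint without a width is not treated as
// explicit here. (Illustrative sketch only.)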
1852 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1853                                    OptimizationRemarkEmitter *ORE) {
1854   assert(!OuterLp->isInnermost() && "This is not an outer loop");
1855   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1856 
1857   // Only outer loops with an explicit vectorization hint are supported.
1858   // Unannotated outer loops are ignored.
1859   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1860     return false;
1861 
1862   Function *Fn = OuterLp->getHeader()->getParent();
1863   if (!Hints.allowVectorization(Fn, OuterLp,
1864                                 true /*VectorizeOnlyWhenForced*/)) {
1865     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1866     return false;
1867   }
1868 
1869   if (Hints.getInterleave() > 1) {
1870     // TODO: Interleave support is future work.
1871     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1872                          "outer loops.\n");
1873     Hints.emitRemarkWithHints();
1874     return false;
1875   }
1876 
1877   return true;
1878 }
1879 
1880 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1881                                   OptimizationRemarkEmitter *ORE,
1882                                   SmallVectorImpl<Loop *> &V) {
1883   // Collect inner loops and outer loops without irreducible control flow. For
1884   // now, only collect outer loops that have explicit vectorization hints. If we
1885   // are stress testing the VPlan H-CFG construction, we collect the outermost
1886   // loop of every loop nest.
1887   if (L.isInnermost() || VPlanBuildStressTest ||
1888       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1889     LoopBlocksRPO RPOT(&L);
1890     RPOT.perform(LI);
1891     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1892       V.push_back(&L);
1893       // TODO: Collect inner loops inside marked outer loops in case
1894       // vectorization fails for the outer loop. Do not invoke
1895       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1896       // already known to be reducible. We can use an inherited attribute for
1897       // that.
1898       return;
1899     }
1900   }
1901   for (Loop *InnerL : L)
1902     collectSupportedLoops(*InnerL, LI, ORE, V);
1903 }
1904 
1905 namespace {
1906 
1907 /// The LoopVectorize Pass.
1908 struct LoopVectorize : public FunctionPass {
1909   /// Pass identification, replacement for typeid
1910   static char ID;
1911 
1912   LoopVectorizePass Impl;
1913 
1914   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1915                          bool VectorizeOnlyWhenForced = false)
1916       : FunctionPass(ID),
1917         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1918     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1919   }
1920 
1921   bool runOnFunction(Function &F) override {
1922     if (skipFunction(F))
1923       return false;
1924 
1925     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1926     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1927     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1928     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1929     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1930     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1931     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1932     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1933     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1934     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1935     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1936     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1937     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1938 
1939     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1940         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1941 
1942     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1943                         GetLAA, *ORE, PSI).MadeAnyChange;
1944   }
1945 
1946   void getAnalysisUsage(AnalysisUsage &AU) const override {
1947     AU.addRequired<AssumptionCacheTracker>();
1948     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1949     AU.addRequired<DominatorTreeWrapperPass>();
1950     AU.addRequired<LoopInfoWrapperPass>();
1951     AU.addRequired<ScalarEvolutionWrapperPass>();
1952     AU.addRequired<TargetTransformInfoWrapperPass>();
1953     AU.addRequired<AAResultsWrapperPass>();
1954     AU.addRequired<LoopAccessLegacyAnalysis>();
1955     AU.addRequired<DemandedBitsWrapperPass>();
1956     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1957     AU.addRequired<InjectTLIMappingsLegacy>();
1958 
1959     // We currently do not preserve loopinfo/dominator analyses with outer loop
1960     // vectorization. Until this is addressed, mark these analyses as preserved
1961     // only for non-VPlan-native path.
1962     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1963     if (!EnableVPlanNativePath) {
1964       AU.addPreserved<LoopInfoWrapperPass>();
1965       AU.addPreserved<DominatorTreeWrapperPass>();
1966     }
1967 
1968     AU.addPreserved<BasicAAWrapperPass>();
1969     AU.addPreserved<GlobalsAAWrapperPass>();
1970     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1971   }
1972 };
1973 
1974 } // end anonymous namespace
1975 
1976 //===----------------------------------------------------------------------===//
1977 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1978 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1979 //===----------------------------------------------------------------------===//
1980 
1981 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // inside the vector loop body.
1985   Instruction *Instr = dyn_cast<Instruction>(V);
1986   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1987                      (!Instr ||
1988                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1989   // Place the code for broadcasting invariant variables in the new preheader.
1990   IRBuilder<>::InsertPointGuard Guard(Builder);
1991   if (SafeToHoist)
1992     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1993 
1994   // Broadcast the scalar into all locations in the vector.
1995   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1996 
1997   return Shuf;
1998 }
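
// Illustrative result (a sketch, assuming VF = 4 and an i32 loop-invariant
// scalar %x): the splat typically materializes in the vector preheader as
//
//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> undef, <4 x i32> zeroinitializer
//
// and stays inside the vector loop body when hoisting is not provably safe.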
1999 
2000 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2001     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
2002   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2003          "Expected either an induction phi-node or a truncate of it!");
2004   Value *Start = II.getStartValue();
2005 
2006   // Construct the initial value of the vector IV in the vector loop preheader
2007   auto CurrIP = Builder.saveIP();
2008   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2009   if (isa<TruncInst>(EntryVal)) {
2010     assert(Start->getType()->isIntegerTy() &&
2011            "Truncation requires an integer type");
2012     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2013     Step = Builder.CreateTrunc(Step, TruncType);
2014     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2015   }
2016   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2017   Value *SteppedStart =
2018       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2019 
2020   // We create vector phi nodes for both integer and floating-point induction
2021   // variables. Here, we determine the kind of arithmetic we will perform.
2022   Instruction::BinaryOps AddOp;
2023   Instruction::BinaryOps MulOp;
2024   if (Step->getType()->isIntegerTy()) {
2025     AddOp = Instruction::Add;
2026     MulOp = Instruction::Mul;
2027   } else {
2028     AddOp = II.getInductionOpcode();
2029     MulOp = Instruction::FMul;
2030   }
2031 
2032   // Multiply the vectorization factor by the step using integer or
2033   // floating-point arithmetic as appropriate.
2034   Value *ConstVF =
2035       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
2036   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
2037 
2038   // Create a vector splat to use in the induction update.
2039   //
2040   // FIXME: If the step is non-constant, we create the vector splat with
2041   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2042   //        handle a constant vector splat.
2043   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2044   Value *SplatVF = isa<Constant>(Mul)
2045                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2046                        : Builder.CreateVectorSplat(VF, Mul);
2047   Builder.restoreIP(CurrIP);
2048 
2049   // We may need to add the step a number of times, depending on the unroll
2050   // factor. The last of those goes into the PHI.
2051   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2052                                     &*LoopVectorBody->getFirstInsertionPt());
2053   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2054   Instruction *LastInduction = VecInd;
2055   for (unsigned Part = 0; Part < UF; ++Part) {
2056     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
2057 
2058     if (isa<TruncInst>(EntryVal))
2059       addMetadata(LastInduction, EntryVal);
2060     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
2061 
2062     LastInduction = cast<Instruction>(addFastMathFlag(
2063         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
2064     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2065   }
2066 
2067   // Move the last step to the end of the latch block. This ensures consistent
2068   // placement of all induction updates.
2069   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2070   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2071   auto *ICmp = cast<Instruction>(Br->getCondition());
2072   LastInduction->moveBefore(ICmp);
2073   LastInduction->setName("vec.ind.next");
2074 
2075   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2076   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2077 }
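
// Worked example (illustrative only): for an i32 induction starting at 0 with
// step 1, VF = 4 and UF = 2, the code above produces roughly
//
//   %vec.ind      = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %preheader ],
//                                 [ %vec.ind.next, %latch ]
//   %step.add     = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
//   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>
//
// i.e. each unroll part advances by VF * Step, and the final update is moved
// to the latch and becomes the phi's backedge value.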
2078 
2079 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2080   return Cost->isScalarAfterVectorization(I, VF) ||
2081          Cost->isProfitableToScalarize(I, VF);
2082 }
2083 
2084 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2085   if (shouldScalarizeInstruction(IV))
2086     return true;
2087   auto isScalarInst = [&](User *U) -> bool {
2088     auto *I = cast<Instruction>(U);
2089     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2090   };
2091   return llvm::any_of(IV->users(), isScalarInst);
2092 }
2093 
2094 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2095     const InductionDescriptor &ID, const Instruction *EntryVal,
2096     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
2097   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2098          "Expected either an induction phi-node or a truncate of it!");
2099 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
2106   if (isa<TruncInst>(EntryVal))
2107     return;
2108 
2109   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2110   if (Casts.empty())
2111     return;
2112   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
  // induction update chain itself.
2115   Instruction *CastInst = *Casts.begin();
2116   if (Lane < UINT_MAX)
2117     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
2118   else
2119     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
2120 }
2121 
2122 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
2123   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2124          "Primary induction variable must have an integer type");
2125 
2126   auto II = Legal->getInductionVars().find(IV);
2127   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2128 
2129   auto ID = II->second;
2130   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2131 
2132   // The value from the original loop to which we are mapping the new induction
2133   // variable.
2134   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2135 
2136   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2137 
2138   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
2140   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2141     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2142            "Induction step should be loop invariant");
2143     if (PSE.getSE()->isSCEVable(IV->getType())) {
2144       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2145       return Exp.expandCodeFor(Step, Step->getType(),
2146                                LoopVectorPreHeader->getTerminator());
2147     }
2148     return cast<SCEVUnknown>(Step)->getValue();
2149   };
2150 
2151   // The scalar value to broadcast. This is derived from the canonical
2152   // induction variable. If a truncation type is given, truncate the canonical
2153   // induction variable and step. Otherwise, derive these values from the
2154   // induction descriptor.
2155   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2156     Value *ScalarIV = Induction;
2157     if (IV != OldInduction) {
2158       ScalarIV = IV->getType()->isIntegerTy()
2159                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2160                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2161                                           IV->getType());
2162       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2163       ScalarIV->setName("offset.idx");
2164     }
2165     if (Trunc) {
2166       auto *TruncType = cast<IntegerType>(Trunc->getType());
2167       assert(Step->getType()->isIntegerTy() &&
2168              "Truncation requires an integer step");
2169       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2170       Step = Builder.CreateTrunc(Step, TruncType);
2171     }
2172     return ScalarIV;
2173   };
2174 
2175   // Create the vector values from the scalar IV, in the absence of creating a
2176   // vector IV.
2177   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2178     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2179     for (unsigned Part = 0; Part < UF; ++Part) {
2180       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2181       Value *EntryPart =
2182           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2183                         ID.getInductionOpcode());
2184       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2185       if (Trunc)
2186         addMetadata(EntryPart, Trunc);
2187       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
2188     }
2189   };
2190 
2191   // Now do the actual transformations, and start with creating the step value.
2192   Value *Step = CreateStepValue(ID.getStep());
2193   if (VF.isZero() || VF.isScalar()) {
2194     Value *ScalarIV = CreateScalarIV(Step);
2195     CreateSplatIV(ScalarIV, Step);
2196     return;
2197   }
2198 
2199   // Determine if we want a scalar version of the induction variable. This is
2200   // true if the induction variable itself is not widened, or if it has at
2201   // least one user in the loop that is not widened.
2202   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2203   if (!NeedsScalarIV) {
2204     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2205     return;
2206   }
2207 
2208   // Try to create a new independent vector induction variable. If we can't
2209   // create the phi node, we will splat the scalar induction variable in each
2210   // loop iteration.
2211   if (!shouldScalarizeInstruction(EntryVal)) {
2212     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2213     Value *ScalarIV = CreateScalarIV(Step);
2214     // Create scalar steps that can be used by instructions we will later
2215     // scalarize. Note that the addition of the scalar steps will not increase
2216     // the number of instructions in the loop in the common case prior to
2217     // InstCombine. We will be trading one vector extract for each scalar step.
2218     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2219     return;
2220   }
2221 
  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV, except when we tail-fold, in which case the splat IV feeds
  // the predicate used by the masked loads/stores.
2225   Value *ScalarIV = CreateScalarIV(Step);
2226   if (!Cost->isScalarEpilogueAllowed())
2227     CreateSplatIV(ScalarIV, Step);
2228   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2229 }
2230 
2231 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2232                                           Instruction::BinaryOps BinOp) {
2233   // Create and check the types.
2234   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2235   int VLen = ValVTy->getNumElements();
2236 
2237   Type *STy = Val->getType()->getScalarType();
2238   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2239          "Induction Step must be an integer or FP");
2240   assert(Step->getType() == STy && "Step has wrong type");
2241 
2242   SmallVector<Constant *, 8> Indices;
2243 
2244   if (STy->isIntegerTy()) {
2245     // Create a vector of consecutive numbers from zero to VF.
2246     for (int i = 0; i < VLen; ++i)
2247       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2248 
2249     // Add the consecutive indices to the vector value.
2250     Constant *Cv = ConstantVector::get(Indices);
2251     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2252     Step = Builder.CreateVectorSplat(VLen, Step);
2253     assert(Step->getType() == Val->getType() && "Invalid step vec");
2254     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2255     // which can be found from the original scalar operations.
2256     Step = Builder.CreateMul(Cv, Step);
2257     return Builder.CreateAdd(Val, Step, "induction");
2258   }
2259 
2260   // Floating point induction.
2261   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2262          "Binary Opcode should be specified for FP induction");
2263   // Create a vector of consecutive numbers from zero to VF.
2264   for (int i = 0; i < VLen; ++i)
2265     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2266 
2267   // Add the consecutive indices to the vector value.
2268   Constant *Cv = ConstantVector::get(Indices);
2269 
2270   Step = Builder.CreateVectorSplat(VLen, Step);
2271 
2272   // Floating point operations had to be 'fast' to enable the induction.
2273   FastMathFlags Flags;
2274   Flags.setFast();
2275 
2276   Value *MulOp = Builder.CreateFMul(Cv, Step);
2277   if (isa<Instruction>(MulOp))
    // We have to check because MulOp may be a constant.
2279     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2280 
2281   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2282   if (isa<Instruction>(BOp))
2283     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2284   return BOp;
2285 }
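
// Illustrative expansion (a sketch, assuming a fixed <4 x i32> %val, an integer
// step %s and StartIdx = 4): the emitted sequence is effectively
//
//   %step.splat = splat of %s into <4 x i32>
//   %offsets    = mul <4 x i32> <i32 4, i32 5, i32 6, i32 7>, %step.splat
//   %induction  = add <4 x i32> %val, %offsets
//
// For FP inductions the same shape is emitted with fmul and fadd/fsub carrying
// fast-math flags.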
2286 
2287 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2288                                            Instruction *EntryVal,
2289                                            const InductionDescriptor &ID) {
2290   // We shouldn't have to build scalar steps if we aren't vectorizing.
2291   assert(VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2293   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2294   assert(ScalarIVTy == Step->getType() &&
2295          "Val and Step should have the same type");
2296 
2297   // We build scalar steps for both integer and floating-point induction
2298   // variables. Here, we determine the kind of arithmetic we will perform.
2299   Instruction::BinaryOps AddOp;
2300   Instruction::BinaryOps MulOp;
2301   if (ScalarIVTy->isIntegerTy()) {
2302     AddOp = Instruction::Add;
2303     MulOp = Instruction::Mul;
2304   } else {
2305     AddOp = ID.getInductionOpcode();
2306     MulOp = Instruction::FMul;
2307   }
2308 
2309   // Determine the number of scalars we need to generate for each unroll
2310   // iteration. If EntryVal is uniform, we only need to generate the first
2311   // lane. Otherwise, we generate all VF values.
2312   unsigned Lanes =
2313       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2314           ? 1
2315           : VF.getKnownMinValue();
2316   assert((!VF.isScalable() || Lanes == 1) &&
2317          "Should never scalarize a scalable vector");
2318   // Compute the scalar steps and save the results in VectorLoopValueMap.
2319   for (unsigned Part = 0; Part < UF; ++Part) {
2320     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2321       auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2322                                          ScalarIVTy->getScalarSizeInBits());
2323       Value *StartIdx =
2324           createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2325       if (ScalarIVTy->isFloatingPointTy())
2326         StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
2327       StartIdx = addFastMathFlag(Builder.CreateBinOp(
2328           AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
2329       // The step returned by `createStepForVF` is a runtime-evaluated value
2330       // when VF is scalable. Otherwise, it should be folded into a Constant.
2331       assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2332              "Expected StartIdx to be folded to a constant when VF is not "
2333              "scalable");
2334       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2335       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2336       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2337       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2338     }
2339   }
2340 }
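
// Worked example (illustrative only): for an i64 scalar IV with step 2, VF = 4,
// UF = 2 and a non-uniform EntryVal, the loop above records, per part and lane,
//
//   Part 0: ScalarIV + (0 + Lane) * 2   for Lane = 0..3
//   Part 1: ScalarIV + (4 + Lane) * 2   for Lane = 0..3
//
// i.e. the start index is Part * VF (via createStepForVF) plus the lane number,
// multiplied by the step and added to the scalar IV.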
2341 
2342 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2343   assert(V != Induction && "The new induction variable should not be used.");
2344   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2345   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2346 
2347   // If we have a stride that is replaced by one, do it here. Defer this for
2348   // the VPlan-native path until we start running Legal checks in that path.
2349   if (!EnableVPlanNativePath && Legal->hasStride(V))
2350     V = ConstantInt::get(V->getType(), 1);
2351 
2352   // If we have a vector mapped to this value, return it.
2353   if (VectorLoopValueMap.hasVectorValue(V, Part))
2354     return VectorLoopValueMap.getVectorValue(V, Part);
2355 
2356   // If the value has not been vectorized, check if it has been scalarized
2357   // instead. If it has been scalarized, and we actually need the value in
2358   // vector form, we will construct the vector values on demand.
2359   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2360     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2361 
2362     // If we've scalarized a value, that value should be an instruction.
2363     auto *I = cast<Instruction>(V);
2364 
2365     // If we aren't vectorizing, we can just copy the scalar map values over to
2366     // the vector map.
2367     if (VF.isScalar()) {
2368       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2369       return ScalarValue;
2370     }
2371 
2372     // Get the last scalar instruction we generated for V and Part. If the value
2373     // is known to be uniform after vectorization, this corresponds to lane zero
2374     // of the Part unroll iteration. Otherwise, the last instruction is the one
2375     // we created for the last vector lane of the Part unroll iteration.
2376     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2377                             ? 0
2378                             : VF.getKnownMinValue() - 1;
2379     assert((!VF.isScalable() || LastLane == 0) &&
2380            "Scalable vectorization can't lead to any scalarized values.");
2381     auto *LastInst = cast<Instruction>(
2382         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2383 
2384     // Set the insert point after the last scalarized instruction. This ensures
2385     // the insertelement sequence will directly follow the scalar definitions.
2386     auto OldIP = Builder.saveIP();
2387     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2388     Builder.SetInsertPoint(&*NewIP);
2389 
2390     // However, if we are vectorizing, we need to construct the vector values.
2391     // If the value is known to be uniform after vectorization, we can just
2392     // broadcast the scalar value corresponding to lane zero for each unroll
2393     // iteration. Otherwise, we construct the vector values using insertelement
2394     // instructions. Since the resulting vectors are stored in
2395     // VectorLoopValueMap, we will only generate the insertelements once.
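    // For example (illustrative IR only; %s.0, %s.1, ... stand for the
    // per-lane scalar values), packing a non-uniform i32 value with a fixed
    // VF of 4 produces a chain like:
    //   %v0 = insertelement <4 x i32> undef, i32 %s.0, i32 0
    //   %v1 = insertelement <4 x i32> %v0,   i32 %s.1, i32 1
    //   ...
    // whereas a uniform value is simply broadcast from lane zero.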
2396     Value *VectorValue = nullptr;
2397     if (Cost->isUniformAfterVectorization(I, VF)) {
2398       VectorValue = getBroadcastInstrs(ScalarValue);
2399       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2400     } else {
2401       // Initialize packing with insertelements to start from undef.
2402       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2403       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2404       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2405       for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2406         packScalarIntoVectorValue(V, {Part, Lane});
2407       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2408     }
2409     Builder.restoreIP(OldIP);
2410     return VectorValue;
2411   }
2412 
2413   // If this scalar is unknown, assume that it is a constant or that it is
2414   // loop invariant. Broadcast V and save the value for future uses.
2415   Value *B = getBroadcastInstrs(V);
2416   VectorLoopValueMap.setVectorValue(V, Part, B);
2417   return B;
2418 }
2419 
2420 Value *
2421 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2422                                             const VPIteration &Instance) {
2423   // If the value is not an instruction contained in the loop, it should
2424   // already be scalar.
2425   if (OrigLoop->isLoopInvariant(V))
2426     return V;
2427 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2431 
2432   // If the value from the original loop has not been vectorized, it is
2433   // represented by UF x VF scalar values in the new loop. Return the requested
2434   // scalar value.
2435   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2436     return VectorLoopValueMap.getScalarValue(V, Instance);
2437 
2438   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2439   // for the given unroll part. If this entry is not a vector type (i.e., the
2440   // vectorization factor is one), there is no need to generate an
2441   // extractelement instruction.
2442   auto *U = getOrCreateVectorValue(V, Instance.Part);
2443   if (!U->getType()->isVectorTy()) {
2444     assert(VF.isScalar() && "Value not scalarized has non-vector type");
2445     return U;
2446   }
2447 
2448   // Otherwise, the value from the original loop has been vectorized and is
2449   // represented by UF vector values. Extract and return the requested scalar
2450   // value from the appropriate vector lane.
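  // For example (illustrative IR only; the type depends on V), requesting
  // lane 2 of a widened i32 value %wide yields:
  //   %scalar = extractelement <4 x i32> %wide, i32 2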
2451   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2452 }
2453 
2454 void InnerLoopVectorizer::packScalarIntoVectorValue(
2455     Value *V, const VPIteration &Instance) {
2456   assert(V != Induction && "The new induction variable should not be used.");
2457   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2458   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2459 
2460   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2461   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2462   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2463                                             Builder.getInt32(Instance.Lane));
2464   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2465 }
2466 
2467 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2468   assert(Vec->getType()->isVectorTy() && "Invalid type");
2469   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
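  // For example, with a fixed VF of 4 the reverse shuffle mask built below is
  // <3, 2, 1, 0>.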
2470   SmallVector<int, 8> ShuffleMask;
2471   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2472     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2473 
2474   return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2475 }
2476 
2477 // Return whether we allow using masked interleave-groups (for dealing with
2478 // strided loads/stores that reside in predicated blocks, or for dealing
2479 // with gaps).
2480 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2481   // If an override option has been passed in for interleaved accesses, use it.
2482   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2483     return EnableMaskedInterleavedMemAccesses;
2484 
2485   return TTI.enableMaskedInterleavedAccessVectorization();
2486 }
2487 
2488 // Try to vectorize the interleave group that \p Instr belongs to.
2489 //
2490 // E.g. Translate following interleaved load group (factor = 3):
2491 //   for (i = 0; i < N; i+=3) {
2492 //     R = Pic[i];             // Member of index 0
2493 //     G = Pic[i+1];           // Member of index 1
2494 //     B = Pic[i+2];           // Member of index 2
2495 //     ... // do something to R, G, B
2496 //   }
2497 // To:
2498 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2499 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2500 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2501 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2502 //
2503 // Or translate following interleaved store group (factor = 3):
2504 //   for (i = 0; i < N; i+=3) {
2505 //     ... do something to R, G, B
2506 //     Pic[i]   = R;           // Member of index 0
2507 //     Pic[i+1] = G;           // Member of index 1
2508 //     Pic[i+2] = B;           // Member of index 2
2509 //   }
2510 // To:
2511 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2512 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2513 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2514 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2515 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2516 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2517     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2518     VPValue *Addr, ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask) {
2519   Instruction *Instr = Group->getInsertPos();
2520   const DataLayout &DL = Instr->getModule()->getDataLayout();
2521 
2522   // Prepare for the vector type of the interleaved load/store.
2523   Type *ScalarTy = getMemInstValueType(Instr);
2524   unsigned InterleaveFactor = Group->getFactor();
2525   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2526   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2527 
2528   // Prepare for the new pointers.
2529   SmallVector<Value *, 2> AddrParts;
2530   unsigned Index = Group->getIndex(Instr);
2531 
2532   // TODO: extend the masked interleaved-group support to reversed access.
2533   assert((!BlockInMask || !Group->isReverse()) &&
2534          "Reversed masked interleave-group not supported.");
2535 
2536   // If the group is reverse, adjust the index to refer to the last vector lane
2537   // instead of the first. We adjust the index from the first vector lane,
2538   // rather than directly getting the pointer for lane VF - 1, because the
2539   // pointer operand of the interleaved access is supposed to be uniform. For
2540   // uniform instructions, we're only required to generate a value for the
2541   // first vector lane in each unroll iteration.
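  // For example, with a fixed VF of 4 and an interleave factor of 3, the
  // index is increased by (4 - 1) * 3 = 9 elements, so the single (uniform)
  // address generated per unroll part corresponds to the tuple accessed by
  // the last vector lane.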
2542   assert(!VF.isScalable() &&
2543          "scalable vector reverse operation is not implemented");
2544   if (Group->isReverse())
2545     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2546 
2547   for (unsigned Part = 0; Part < UF; Part++) {
2548     Value *AddrPart = State.get(Addr, {Part, 0});
2549     setDebugLocFromInst(Builder, AddrPart);
2550 
    // Note that the current instruction may be at any index in the group.
    // Adjust the address so it refers to the member at index 0.
2553     //
2554     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2555     //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
2557     //
2558     // E.g.  A[i+1] = a;     // Member of index 1
2559     //       A[i]   = b;     // Member of index 0
2560     //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2562 
2563     bool InBounds = false;
2564     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2565       InBounds = gep->isInBounds();
2566     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2567     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2568 
2569     // Cast to the vector pointer type.
2570     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2571     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2572     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2573   }
2574 
2575   setDebugLocFromInst(Builder, Instr);
2576   Value *UndefVec = UndefValue::get(VecTy);
2577 
2578   Value *MaskForGaps = nullptr;
2579   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2580     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2581     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2582     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2583   }
2584 
2585   // Vectorize the interleaved load group.
2586   if (isa<LoadInst>(Instr)) {
2587     // For each unroll part, create a wide load for the group.
2588     SmallVector<Value *, 2> NewLoads;
2589     for (unsigned Part = 0; Part < UF; Part++) {
2590       Instruction *NewLoad;
2591       if (BlockInMask || MaskForGaps) {
2592         assert(useMaskedInterleavedAccesses(*TTI) &&
2593                "masked interleaved groups are not allowed.");
2594         Value *GroupMask = MaskForGaps;
2595         if (BlockInMask) {
2596           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2597           assert(!VF.isScalable() && "scalable vectors not yet supported.");
2598           Value *ShuffledMask = Builder.CreateShuffleVector(
2599               BlockInMaskPart,
2600               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2601               "interleaved.mask");
2602           GroupMask = MaskForGaps
2603                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2604                                                 MaskForGaps)
2605                           : ShuffledMask;
2606         }
2607         NewLoad =
2608             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2609                                      GroupMask, UndefVec, "wide.masked.vec");
2610       }
2611       else
2612         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2613                                             Group->getAlign(), "wide.vec");
2614       Group->addMetadata(NewLoad);
2615       NewLoads.push_back(NewLoad);
2616     }
2617 
2618     // For each member in the group, shuffle out the appropriate data from the
2619     // wide loads.
2620     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2621       Instruction *Member = Group->getMember(I);
2622 
2623       // Skip the gaps in the group.
2624       if (!Member)
2625         continue;
2626 
2627       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2628       auto StrideMask =
2629           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2630       for (unsigned Part = 0; Part < UF; Part++) {
2631         Value *StridedVec = Builder.CreateShuffleVector(
2632             NewLoads[Part], StrideMask, "strided.vec");
2633 
        // If this member has a different type, cast the result to that type.
2635         if (Member->getType() != ScalarTy) {
2636           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2637           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2638           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2639         }
2640 
2641         if (Group->isReverse())
2642           StridedVec = reverseVector(StridedVec);
2643 
2644         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2645       }
2646     }
2647     return;
2648   }
2649 
  // The subvector type for the current instruction.
2651   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2652   auto *SubVT = VectorType::get(ScalarTy, VF);
2653 
2654   // Vectorize the interleaved store group.
2655   for (unsigned Part = 0; Part < UF; Part++) {
2656     // Collect the stored vector from each member.
2657     SmallVector<Value *, 4> StoredVecs;
2658     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow a gap, so each index has a
      // member.
      assert(Group->getMember(i) &&
             "Fail to get a member from an interleaved store group");
2661 
2662       Value *StoredVec = State.get(StoredValues[i], Part);
2663 
2664       if (Group->isReverse())
2665         StoredVec = reverseVector(StoredVec);
2666 
      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2671 
2672       StoredVecs.push_back(StoredVec);
2673     }
2674 
2675     // Concatenate all vectors into a wide vector.
2676     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2677 
2678     // Interleave the elements in the wide vector.
2679     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2680     Value *IVec = Builder.CreateShuffleVector(
2681         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2682         "interleaved.vec");
2683 
2684     Instruction *NewStoreInstr;
2685     if (BlockInMask) {
2686       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2687       Value *ShuffledMask = Builder.CreateShuffleVector(
2688           BlockInMaskPart,
2689           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2690           "interleaved.mask");
2691       NewStoreInstr = Builder.CreateMaskedStore(
2692           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2693     }
2694     else
2695       NewStoreInstr =
2696           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2697 
2698     Group->addMetadata(NewStoreInstr);
2699   }
2700 }
2701 
2702 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2703     Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2704     VPValue *StoredValue, VPValue *BlockInMask) {
2705   // Attempt to issue a wide load.
2706   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2707   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2708 
2709   assert((LI || SI) && "Invalid Load/Store instruction");
2710   assert((!SI || StoredValue) && "No stored value provided for widened store");
2711   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2712 
2713   LoopVectorizationCostModel::InstWidening Decision =
2714       Cost->getWideningDecision(Instr, VF);
2715   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2716           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2717           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2718          "CM decision is not to widen the memory instruction");
2719 
2720   Type *ScalarDataTy = getMemInstValueType(Instr);
2721 
2722   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2723   const Align Alignment = getLoadStoreAlignment(Instr);
2724 
2725   // Determine if the pointer operand of the access is either consecutive or
2726   // reverse consecutive.
2727   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2728   bool ConsecutiveStride =
2729       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2730   bool CreateGatherScatter =
2731       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2732 
2733   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2734   // gather/scatter. Otherwise Decision should have been to Scalarize.
2735   assert((ConsecutiveStride || CreateGatherScatter) &&
2736          "The instruction should be scalarized");
2737   (void)ConsecutiveStride;
2738 
2739   VectorParts BlockInMaskParts(UF);
2740   bool isMaskRequired = BlockInMask;
2741   if (isMaskRequired)
2742     for (unsigned Part = 0; Part < UF; ++Part)
2743       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2744 
2745   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2746     // Calculate the pointer for the specific unroll-part.
2747     GetElementPtrInst *PartPtr = nullptr;
2748 
2749     bool InBounds = false;
2750     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2751       InBounds = gep->isInBounds();
2752 
2753     if (Reverse) {
2754       assert(!VF.isScalable() &&
2755              "Reversing vectors is not yet supported for scalable vectors.");
2756 
2757       // If the address is consecutive but reversed, then the
2758       // wide store needs to start at the last vector element.
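      // Illustrative example: with a fixed VF of 4, Part 0 uses &Ptr[-3],
      // Part 1 uses &Ptr[-7], and so on; the loaded or stored vector is then
      // reversed to restore the original element order.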
2759       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2760           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2761       PartPtr->setIsInBounds(InBounds);
2762       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2763           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2764       PartPtr->setIsInBounds(InBounds);
2765       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2766         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2767     } else {
2768       Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2769       PartPtr = cast<GetElementPtrInst>(
2770           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2771       PartPtr->setIsInBounds(InBounds);
2772     }
2773 
2774     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2775     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2776   };
2777 
2778   // Handle Stores:
2779   if (SI) {
2780     setDebugLocFromInst(Builder, SI);
2781 
2782     for (unsigned Part = 0; Part < UF; ++Part) {
2783       Instruction *NewSI = nullptr;
2784       Value *StoredVal = State.get(StoredValue, Part);
2785       if (CreateGatherScatter) {
2786         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2787         Value *VectorGep = State.get(Addr, Part);
2788         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2789                                             MaskPart);
2790       } else {
2791         if (Reverse) {
2792           // If we store to reverse consecutive memory locations, then we need
2793           // to reverse the order of elements in the stored value.
2794           StoredVal = reverseVector(StoredVal);
2795           // We don't want to update the value in the map as it might be used in
2796           // another expression. So don't call resetVectorValue(StoredVal).
2797         }
2798         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2799         if (isMaskRequired)
2800           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2801                                             BlockInMaskParts[Part]);
2802         else
2803           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2804       }
2805       addMetadata(NewSI, SI);
2806     }
2807     return;
2808   }
2809 
2810   // Handle loads.
2811   assert(LI && "Must have a load instruction");
2812   setDebugLocFromInst(Builder, LI);
2813   for (unsigned Part = 0; Part < UF; ++Part) {
2814     Value *NewLI;
2815     if (CreateGatherScatter) {
2816       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2817       Value *VectorGep = State.get(Addr, Part);
2818       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2819                                          nullptr, "wide.masked.gather");
2820       addMetadata(NewLI, LI);
2821     } else {
2822       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2823       if (isMaskRequired)
2824         NewLI = Builder.CreateMaskedLoad(
2825             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2826             "wide.masked.load");
2827       else
2828         NewLI =
2829             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2830 
      // Add metadata to the load, but record the (possibly reversed) shuffle
      // as the vectorized value.
2832       addMetadata(NewLI, LI);
2833       if (Reverse)
2834         NewLI = reverseVector(NewLI);
2835     }
2836 
2837     State.set(Def, Instr, NewLI, Part);
2838   }
2839 }
2840 
2841 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2842                                                const VPIteration &Instance,
2843                                                bool IfPredicateInstr,
2844                                                VPTransformState &State) {
2845   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2846 
2847   setDebugLocFromInst(Builder, Instr);
2848 
  // Does this instruction return a value?
2850   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2851 
2852   Instruction *Cloned = Instr->clone();
2853   if (!IsVoidRetTy)
2854     Cloned->setName(Instr->getName() + ".cloned");
2855 
2856   // Replace the operands of the cloned instructions with their scalar
2857   // equivalents in the new loop.
2858   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2859     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2860     auto InputInstance = Instance;
2861     if (!Operand || !OrigLoop->contains(Operand) ||
2862         (Cost->isUniformAfterVectorization(Operand, State.VF)))
2863       InputInstance.Lane = 0;
2864     auto *NewOp = State.get(User.getOperand(op), InputInstance);
2865     Cloned->setOperand(op, NewOp);
2866   }
2867   addNewMetadata(Cloned, Instr);
2868 
2869   // Place the cloned scalar in the new loop.
2870   Builder.Insert(Cloned);
2871 
  // TODO: Set result for VPValue of VPReplicateRecipe. This requires
2873   // representing scalar values in VPTransformState. Add the cloned scalar to
2874   // the scalar map entry.
2875   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2876 
2877   // If we just cloned a new assumption, add it the assumption cache.
2878   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2879     if (II->getIntrinsicID() == Intrinsic::assume)
2880       AC->registerAssumption(II);
2881 
2882   // End if-block.
2883   if (IfPredicateInstr)
2884     PredicatedInstructions.push_back(Cloned);
2885 }
2886 
2887 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2888                                                       Value *End, Value *Step,
2889                                                       Instruction *DL) {
2890   BasicBlock *Header = L->getHeader();
2891   BasicBlock *Latch = L->getLoopLatch();
2892   // As we're just creating this loop, it's possible no latch exists
2893   // yet. If so, use the header as this will be a single block loop.
2894   if (!Latch)
2895     Latch = Header;
2896 
2897   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2898   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2899   setDebugLocFromInst(Builder, OldInst);
2900   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2901 
2902   Builder.SetInsertPoint(Latch->getTerminator());
2903   setDebugLocFromInst(Builder, OldInst);
2904 
2905   // Create i+1 and fill the PHINode.
2906   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2907   Induction->addIncoming(Start, L->getLoopPreheader());
2908   Induction->addIncoming(Next, Latch);
2909   // Create the compare.
2910   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2911   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2912 
2913   // Now we have two terminators. Remove the old one from the block.
2914   Latch->getTerminator()->eraseFromParent();
2915 
2916   return Induction;
2917 }
2918 
2919 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2920   if (TripCount)
2921     return TripCount;
2922 
2923   assert(L && "Create Trip Count for null loop.");
2924   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2925   // Find the loop boundaries.
2926   ScalarEvolution *SE = PSE.getSE();
2927   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2928   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2929          "Invalid loop count");
2930 
2931   Type *IdxTy = Legal->getWidestInductionType();
2932   assert(IdxTy && "No type for induction");
2933 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign extended before the
  // compare. The only way we can get a backedge-taken count in that situation
  // is if the induction variable was signed and therefore will not overflow,
  // so the truncation is legal.
2939   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2940       IdxTy->getPrimitiveSizeInBits())
2941     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2942   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2943 
2944   // Get the total trip count from the count by adding 1.
2945   const SCEV *ExitCount = SE->getAddExpr(
2946       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2947 
2948   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2949 
2950   // Expand the trip count and place the new instructions in the preheader.
2951   // Notice that the pre-header does not change, only the loop body.
2952   SCEVExpander Exp(*SE, DL, "induction");
2953 
2954   // Count holds the overall loop count (N).
2955   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2956                                 L->getLoopPreheader()->getTerminator());
2957 
2958   if (TripCount->getType()->isPointerTy())
2959     TripCount =
2960         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2961                                     L->getLoopPreheader()->getTerminator());
2962 
2963   return TripCount;
2964 }
2965 
2966 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2967   if (VectorTripCount)
2968     return VectorTripCount;
2969 
2970   Value *TC = getOrCreateTripCount(L);
2971   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2972 
2973   Type *Ty = TC->getType();
2974   // This is where we can make the step a runtime constant.
2975   Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
2976 
2977   // If the tail is to be folded by masking, round the number of iterations N
2978   // up to a multiple of Step instead of rounding down. This is done by first
2979   // adding Step-1 and then rounding down. Note that it's ok if this addition
2980   // overflows: the vector induction variable will eventually wrap to zero given
2981   // that it starts at zero and its Step is a power of two; the loop will then
2982   // exit, with the last early-exit vector comparison also producing all-true.
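  // Illustrative example: with N = 10 and VF * UF = 8, folding the tail first
  // computes TC = 10 + 7 = 17, and the rounding below then yields a vector
  // trip count of 16, so the masked vector loop covers all 10 iterations.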
2983   if (Cost->foldTailByMasking()) {
2984     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2985            "VF*UF must be a power of 2 when folding tail by masking");
2986     assert(!VF.isScalable() &&
2987            "Tail folding not yet supported for scalable vectors");
2988     TC = Builder.CreateAdd(
2989         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
2990   }
2991 
2992   // Now we need to generate the expression for the part of the loop that the
2993   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2994   // iterations are not required for correctness, or N - Step, otherwise. Step
2995   // is equal to the vectorization factor (number of SIMD elements) times the
2996   // unroll factor (number of SIMD instructions).
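  // Illustrative example (no tail folding): with N = 10, VF = 4 and UF = 1,
  // Step is 4, the remainder R = 10 % 4 = 2, and the vector trip count is
  // 10 - 2 = 8; the remaining 2 iterations run in the scalar epilogue.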
2997   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2998 
2999   // If there is a non-reversed interleaved group that may speculatively access
3000   // memory out-of-bounds, we need to ensure that there will be at least one
3001   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
3002   // the trip count, we set the remainder to be equal to the step. If the step
3003   // does not evenly divide the trip count, no adjustment is necessary since
3004   // there will already be scalar iterations. Note that the minimum iterations
3005   // check ensures that N >= Step.
3006   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
3007     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3008     R = Builder.CreateSelect(IsZero, Step, R);
3009   }
3010 
3011   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3012 
3013   return VectorTripCount;
3014 }
3015 
3016 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3017                                                    const DataLayout &DL) {
3018   // Verify that V is a vector type with same number of elements as DstVTy.
3019   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3020   unsigned VF = DstFVTy->getNumElements();
3021   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
3023   Type *SrcElemTy = SrcVecTy->getElementType();
3024   Type *DstElemTy = DstFVTy->getElementType();
3025   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3026          "Vector elements must have same size");
3027 
3028   // Do a direct cast if element types are castable.
3029   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3030     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3031   }
3032   // V cannot be directly casted to desired vector type.
3033   // May happen when V is a floating point vector but DstVTy is a vector of
3034   // pointers or vice-versa. Handle this using a two-step bitcast using an
3035   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
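  // Illustrative example: on a target with 64-bit pointers, casting
  // <2 x double> to <2 x i8*> is performed as
  //   <2 x double> -> <2 x i64> -> <2 x i8*>.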
3036   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3037          "Only one type should be a pointer type");
3038   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3039          "Only one type should be a floating point type");
3040   Type *IntTy =
3041       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3042   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3043   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3044   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3045 }
3046 
3047 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3048                                                          BasicBlock *Bypass) {
3049   Value *Count = getOrCreateTripCount(L);
3050   // Reuse existing vector loop preheader for TC checks.
3051   // Note that new preheader block is generated for vector loop.
3052   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3053   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3054 
3055   // Generate code to check if the loop's trip count is less than VF * UF, or
3056   // equal to it in case a scalar epilogue is required; this implies that the
3057   // vector trip count is zero. This check also covers the case where adding one
3058   // to the backedge-taken count overflowed leading to an incorrect trip count
3059   // of zero. In this case we will also jump to the scalar loop.
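  // Illustrative example: with a fixed VF of 4 and UF of 2, the bypass to the
  // scalar loop is taken when Count < 8 (or Count <= 8 if a scalar epilogue
  // is required).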
3060   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3061                                           : ICmpInst::ICMP_ULT;
3062 
3063   // If tail is to be folded, vector loop takes care of all iterations.
3064   Value *CheckMinIters = Builder.getFalse();
3065   if (!Cost->foldTailByMasking()) {
3066     Value *Step =
3067         createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3068     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3069   }
3070   // Create new preheader for vector loop.
3071   LoopVectorPreHeader =
3072       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3073                  "vector.ph");
3074 
3075   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3076                                DT->getNode(Bypass)->getIDom()) &&
3077          "TC check is expected to dominate Bypass");
3078 
3079   // Update dominator for Bypass & LoopExit.
3080   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3081   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3082 
3083   ReplaceInstWithInst(
3084       TCCheckBlock->getTerminator(),
3085       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3086   LoopBypassBlocks.push_back(TCCheckBlock);
3087 }
3088 
3089 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3090   // Reuse existing vector loop preheader for SCEV checks.
3091   // Note that new preheader block is generated for vector loop.
3092   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
3093 
  // Generate the code to check the SCEV assumptions that we made.
3095   // We want the new basic block to start at the first instruction in a
3096   // sequence of instructions that form a check.
3097   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
3098                    "scev.check");
3099   Value *SCEVCheck = Exp.expandCodeForPredicate(
3100       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
3101 
3102   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
3103     if (C->isZero())
3104       return;
3105 
3106   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3107            (OptForSizeBasedOnProfile &&
3108             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3109          "Cannot SCEV check stride or overflow when optimizing for size");
3110 
3111   SCEVCheckBlock->setName("vector.scevcheck");
3112   // Create new preheader for vector loop.
3113   LoopVectorPreHeader =
3114       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
3115                  nullptr, "vector.ph");
3116 
3117   // Update dominator only if this is first RT check.
3118   if (LoopBypassBlocks.empty()) {
3119     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3120     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3121   }
3122 
3123   ReplaceInstWithInst(
3124       SCEVCheckBlock->getTerminator(),
3125       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
3126   LoopBypassBlocks.push_back(SCEVCheckBlock);
3127   AddedSafetyChecks = true;
3128 }
3129 
3130 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3131   // VPlan-native path does not do any analysis for runtime checks currently.
3132   if (EnableVPlanNativePath)
3133     return;
3134 
3135   // Reuse existing vector loop preheader for runtime memory checks.
3136   // Note that new preheader block is generated for vector loop.
3137   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
3138 
3139   // Generate the code that checks in runtime if arrays overlap. We put the
3140   // checks into a separate block to make the more common case of few elements
3141   // faster.
3142   auto *LAI = Legal->getLAI();
3143   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
3144   if (!RtPtrChecking.Need)
3145     return;
3146 
3147   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3148     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3149            "Cannot emit memory checks when optimizing for size, unless forced "
3150            "to vectorize.");
3151     ORE->emit([&]() {
3152       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3153                                         L->getStartLoc(), L->getHeader())
3154              << "Code-size may be reduced by not forcing "
3155                 "vectorization, or by source-code modifications "
3156                 "eliminating the need for runtime checks "
3157                 "(e.g., adding 'restrict').";
3158     });
3159   }
3160 
3161   MemCheckBlock->setName("vector.memcheck");
3162   // Create new preheader for vector loop.
3163   LoopVectorPreHeader =
3164       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
3165                  "vector.ph");
3166 
  // Update dominator only if this is the first RT check. This must happen
  // before MemCheckBlock is added to LoopBypassBlocks, otherwise the check
  // below would never fire.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, MemCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
  }

  auto *CondBranch = cast<BranchInst>(
      Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
  ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
  LoopBypassBlocks.push_back(MemCheckBlock);
  AddedSafetyChecks = true;
3178 
3179   Instruction *FirstCheckInst;
3180   Instruction *MemRuntimeCheck;
3181   std::tie(FirstCheckInst, MemRuntimeCheck) =
3182       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
3183                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
3184   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
3185                             "claimed checks are required");
3186   CondBranch->setCondition(MemRuntimeCheck);
3187 
3188   // We currently don't use LoopVersioning for the actual loop cloning but we
3189   // still use it to add the noalias metadata.
3190   LVer = std::make_unique<LoopVersioning>(
3191       *Legal->getLAI(),
3192       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3193       DT, PSE.getSE());
3194   LVer->prepareNoAliasMetadata();
3195 }
3196 
3197 Value *InnerLoopVectorizer::emitTransformedIndex(
3198     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3199     const InductionDescriptor &ID) const {
3200 
3201   SCEVExpander Exp(*SE, DL, "induction");
3202   auto Step = ID.getStep();
3203   auto StartValue = ID.getStartValue();
3204   assert(Index->getType() == Step->getType() &&
3205          "Index type does not match StepValue type");
3206 
3207   // Note: the IR at this point is broken. We cannot use SE to create any new
3208   // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the IRBuilder and rely
3211   // on InstCombine for future simplifications. Here we handle some trivial
3212   // cases only.
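  // For reference, the transformed index computed below is, conceptually:
  //   integer IV:  StartValue + Index * Step
  //   pointer IV:  &StartValue[Index * Step]
  //   FP IV:       StartValue fadd/fsub (Index fmul Step)
  // with the X + 0 and X * 1 cases folded away by the helpers below.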
3213   auto CreateAdd = [&B](Value *X, Value *Y) {
3214     assert(X->getType() == Y->getType() && "Types don't match!");
3215     if (auto *CX = dyn_cast<ConstantInt>(X))
3216       if (CX->isZero())
3217         return Y;
3218     if (auto *CY = dyn_cast<ConstantInt>(Y))
3219       if (CY->isZero())
3220         return X;
3221     return B.CreateAdd(X, Y);
3222   };
3223 
3224   auto CreateMul = [&B](Value *X, Value *Y) {
3225     assert(X->getType() == Y->getType() && "Types don't match!");
3226     if (auto *CX = dyn_cast<ConstantInt>(X))
3227       if (CX->isOne())
3228         return Y;
3229     if (auto *CY = dyn_cast<ConstantInt>(Y))
3230       if (CY->isOne())
3231         return X;
3232     return B.CreateMul(X, Y);
3233   };
3234 
3235   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3236   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3237   // the DomTree is not kept up-to-date for additional blocks generated in the
3238   // vector loop. By using the header as insertion point, we guarantee that the
3239   // expanded instructions dominate all their uses.
3240   auto GetInsertPoint = [this, &B]() {
3241     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3242     if (InsertBB != LoopVectorBody &&
3243         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3244       return LoopVectorBody->getTerminator();
3245     return &*B.GetInsertPoint();
3246   };
3247   switch (ID.getKind()) {
3248   case InductionDescriptor::IK_IntInduction: {
3249     assert(Index->getType() == StartValue->getType() &&
3250            "Index type does not match StartValue type");
3251     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3252       return B.CreateSub(StartValue, Index);
3253     auto *Offset = CreateMul(
3254         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3255     return CreateAdd(StartValue, Offset);
3256   }
3257   case InductionDescriptor::IK_PtrInduction: {
3258     assert(isa<SCEVConstant>(Step) &&
3259            "Expected constant step for pointer induction");
3260     return B.CreateGEP(
3261         StartValue->getType()->getPointerElementType(), StartValue,
3262         CreateMul(Index,
3263                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3264   }
3265   case InductionDescriptor::IK_FpInduction: {
3266     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3267     auto InductionBinOp = ID.getInductionBinOp();
3268     assert(InductionBinOp &&
3269            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3270             InductionBinOp->getOpcode() == Instruction::FSub) &&
3271            "Original bin op should be defined for FP induction");
3272 
3273     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3274 
3275     // Floating point operations had to be 'fast' to enable the induction.
3276     FastMathFlags Flags;
3277     Flags.setFast();
3278 
3279     Value *MulExp = B.CreateFMul(StepValue, Index);
3280     if (isa<Instruction>(MulExp))
      // We have to check because MulExp may be folded to a constant.
3282       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3283 
3284     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3285                                "induction");
3286     if (isa<Instruction>(BOp))
3287       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3288 
3289     return BOp;
3290   }
3291   case InductionDescriptor::IK_NoInduction:
3292     return nullptr;
3293   }
3294   llvm_unreachable("invalid enum");
3295 }
3296 
3297 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3298   LoopScalarBody = OrigLoop->getHeader();
3299   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3300   LoopExitBlock = OrigLoop->getExitBlock();
3301   assert(LoopExitBlock && "Must have an exit block");
3302   assert(LoopVectorPreHeader && "Invalid loop structure");
3303 
3304   LoopMiddleBlock =
3305       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3306                  LI, nullptr, Twine(Prefix) + "middle.block");
3307   LoopScalarPreHeader =
3308       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3309                  nullptr, Twine(Prefix) + "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
3313   LoopVectorBody =
3314       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3315                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3316 
3317   // Update dominator for loop exit.
3318   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3319 
3320   // Create and register the new vector loop.
3321   Loop *Lp = LI->AllocateLoop();
3322   Loop *ParentLoop = OrigLoop->getParentLoop();
3323 
3324   // Insert the new loop into the loop nest and register the new basic blocks
3325   // before calling any utilities such as SCEV that require valid LoopInfo.
3326   if (ParentLoop) {
3327     ParentLoop->addChildLoop(Lp);
3328   } else {
3329     LI->addTopLevelLoop(Lp);
3330   }
3331   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3332   return Lp;
3333 }
3334 
3335 void InnerLoopVectorizer::createInductionResumeValues(
3336     Loop *L, Value *VectorTripCount,
3337     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3338   assert(VectorTripCount && L && "Expected valid arguments");
3339   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3340           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3341          "Inconsistent information about additional bypass.");
3342   // We are going to resume the execution of the scalar loop.
3343   // Go over all of the induction variables that we found and fix the
3344   // PHIs that are left in the scalar version of the loop.
3345   // The starting values of PHI nodes depend on the counter of the last
3346   // iteration in the vectorized loop.
3347   // If we come from a bypass edge then we need to start from the original
3348   // start value.
3349   for (auto &InductionEntry : Legal->getInductionVars()) {
3350     PHINode *OrigPhi = InductionEntry.first;
3351     InductionDescriptor II = InductionEntry.second;
3352 
    // Create phi nodes to merge from the backedge-taken check block.
3354     PHINode *BCResumeVal =
3355         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3356                         LoopScalarPreHeader->getTerminator());
3357     // Copy original phi DL over to the new one.
3358     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3359     Value *&EndValue = IVEndValues[OrigPhi];
3360     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3361     if (OrigPhi == OldInduction) {
3362       // We know what the end value is.
3363       EndValue = VectorTripCount;
3364     } else {
3365       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3366       Type *StepType = II.getStep()->getType();
3367       Instruction::CastOps CastOp =
3368           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3369       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3370       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3371       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3372       EndValue->setName("ind.end");
3373 
3374       // Compute the end value for the additional bypass (if applicable).
3375       if (AdditionalBypass.first) {
3376         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3377         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3378                                          StepType, true);
3379         CRD =
3380             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3381         EndValueFromAdditionalBypass =
3382             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3383         EndValueFromAdditionalBypass->setName("ind.end");
3384       }
3385     }
3386     // The new PHI merges the original incoming value, in case of a bypass,
3387     // or the value at the end of the vectorized loop.
3388     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3389 
3390     // Fix the scalar body counter (PHI node).
3391     // The old induction's phi node in the scalar body needs the truncated
3392     // value.
3393     for (BasicBlock *BB : LoopBypassBlocks)
3394       BCResumeVal->addIncoming(II.getStartValue(), BB);
3395 
3396     if (AdditionalBypass.first)
3397       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3398                                             EndValueFromAdditionalBypass);
3399 
3400     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3401   }
3402 }
3403 
3404 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3405                                                       MDNode *OrigLoopID) {
3406   assert(L && "Expected valid loop.");
3407 
3408   // The trip counts should be cached by now.
3409   Value *Count = getOrCreateTripCount(L);
3410   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3411 
3412   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3413 
3414   // Add a check in the middle block to see if we have completed
3415   // all of the iterations in the first vector loop.
3416   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3417   // If tail is to be folded, we know we don't need to run the remainder.
3418   Value *CmpN = Builder.getTrue();
3419   if (!Cost->foldTailByMasking()) {
3420     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3421                            VectorTripCount, "cmp.n",
3422                            LoopMiddleBlock->getTerminator());
3423 
3424     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3425     // of the corresponding compare because they may have ended up with
3426     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3428     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3429   }
3430 
3431   BranchInst *BrInst =
3432       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3433   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3434   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3435 
3436   // Get ready to start creating new instructions into the vectorized body.
3437   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3438          "Inconsistent vector loop preheader");
3439   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3440 
3441   Optional<MDNode *> VectorizedLoopID =
3442       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3443                                       LLVMLoopVectorizeFollowupVectorized});
3444   if (VectorizedLoopID.hasValue()) {
3445     L->setLoopID(VectorizedLoopID.getValue());
3446 
3447     // Do not setAlreadyVectorized if loop attributes have been defined
3448     // explicitly.
3449     return LoopVectorPreHeader;
3450   }
3451 
3452   // Keep all loop hints from the original loop on the vector loop (we'll
3453   // replace the vectorizer-specific hints below).
3454   if (MDNode *LID = OrigLoop->getLoopID())
3455     L->setLoopID(LID);
3456 
3457   LoopVectorizeHints Hints(L, true, *ORE);
3458   Hints.setAlreadyVectorized();
3459 
3460 #ifdef EXPENSIVE_CHECKS
3461   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3462   LI->verify(*DT);
3463 #endif
3464 
3465   return LoopVectorPreHeader;
3466 }
3467 
3468 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3469   /*
3470    In this function we generate a new loop. The new loop will contain
3471    the vectorized instructions while the old loop will continue to run the
3472    scalar remainder.
3473 
3474        [ ] <-- loop iteration number check.
3475     /   |
3476    /    v
3477   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3478   |  /  |
3479   | /   v
3480   ||   [ ]     <-- vector pre header.
3481   |/    |
3482   |     v
3483   |    [  ] \
3484   |    [  ]_|   <-- vector loop.
3485   |     |
3486   |     v
3487   |   -[ ]   <--- middle-block.
3488   |  /  |
3489   | /   v
3490   -|- >[ ]     <--- new preheader.
3491    |    |
3492    |    v
3493    |   [ ] \
3494    |   [ ]_|   <-- old scalar loop to handle remainder.
3495     \   |
3496      \  v
3497       >[ ]     <-- exit block.
3498    ...
3499    */
3500 
3501   // Get the metadata of the original loop before it gets modified.
3502   MDNode *OrigLoopID = OrigLoop->getLoopID();
3503 
3504   // Create an empty vector loop, and prepare basic blocks for the runtime
3505   // checks.
3506   Loop *Lp = createVectorLoopSkeleton("");
3507 
3508   // Now, compare the new count to zero. If it is zero skip the vector loop and
3509   // jump to the scalar loop. This check also covers the case where the
3510   // backedge-taken count is uint##_max: adding one to it will overflow leading
3511   // to an incorrect trip count of zero. In this (rare) case we will also jump
3512   // to the scalar loop.
3513   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3514 
3515   // Generate the code to check any assumptions that we've made for SCEV
3516   // expressions.
3517   emitSCEVChecks(Lp, LoopScalarPreHeader);
3518 
3519   // Generate the code that checks in runtime if arrays overlap. We put the
3520   // checks into a separate block to make the more common case of few elements
3521   // faster.
3522   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3523 
3524   // Some loops have a single integer induction variable, while other loops
3525   // don't. One example is c++ iterators that often have multiple pointer
3526   // induction variables. In the code below we also support a case where we
3527   // don't have a single induction variable.
3528   //
  // We try hard to obtain an induction variable from the original loop.
  // However, if we don't find one that:
3531   //   - is an integer
3532   //   - counts from zero, stepping by one
3533   //   - is the size of the widest induction variable type
3534   // then we create a new one.
3535   OldInduction = Legal->getPrimaryInduction();
3536   Type *IdxTy = Legal->getWidestInductionType();
3537   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3538   // The loop step is equal to the vectorization factor (num of SIMD elements)
3539   // times the unroll factor (num of SIMD instructions).
3540   Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3541   Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3542   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3543   Induction =
3544       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3545                               getDebugLocFromInstOrOperands(OldInduction));
3546 
3547   // Emit phis for the new starting index of the scalar loop.
3548   createInductionResumeValues(Lp, CountRoundDown);
3549 
3550   return completeLoopSkeleton(Lp, OrigLoopID);
3551 }
3552 
3553 // Fix up external users of the induction variable. At this point, we are
3554 // in LCSSA form, with all external PHIs that use the IV having one input value,
3555 // coming from the remainder loop. We need those PHIs to also have a correct
3556 // value for the IV when arriving directly from the middle block.
3557 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3558                                        const InductionDescriptor &II,
3559                                        Value *CountRoundDown, Value *EndValue,
3560                                        BasicBlock *MiddleBlock) {
3561   // There are two kinds of external IV usages - those that use the value
3562   // computed in the last iteration (the PHI) and those that use the penultimate
3563   // value (the value that feeds into the phi from the loop latch).
3564   // We allow both, but they, obviously, have different values.
3565 
3566   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3567 
3568   DenseMap<Value *, Value *> MissingVals;
3569 
3570   // An external user of the last iteration's value should see the value that
3571   // the remainder loop uses to initialize its own IV.
3572   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3573   for (User *U : PostInc->users()) {
3574     Instruction *UI = cast<Instruction>(U);
3575     if (!OrigLoop->contains(UI)) {
3576       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3577       MissingVals[UI] = EndValue;
3578     }
3579   }
3580 
3581   // An external user of the penultimate value needs to see EndValue - Step.
3582   // The simplest way to get this is to recompute it from the constituent SCEVs,
3583   // that is Start + (Step * (CRD - 1)).
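       // For illustration with made-up numbers: if Start = 0, Step = 1 and
       // CRD = 8, the escape value is 0 + 1 * (8 - 1) = 7, i.e. the value the
       // phi held on the last iteration executed by the vector loop.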
3584   for (User *U : OrigPhi->users()) {
3585     auto *UI = cast<Instruction>(U);
3586     if (!OrigLoop->contains(UI)) {
3587       const DataLayout &DL =
3588           OrigLoop->getHeader()->getModule()->getDataLayout();
3589       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3590 
3591       IRBuilder<> B(MiddleBlock->getTerminator());
3592       Value *CountMinusOne = B.CreateSub(
3593           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3594       Value *CMO =
3595           !II.getStep()->getType()->isIntegerTy()
3596               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3597                              II.getStep()->getType())
3598               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3599       CMO->setName("cast.cmo");
3600       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3601       Escape->setName("ind.escape");
3602       MissingVals[UI] = Escape;
3603     }
3604   }
3605 
3606   for (auto &I : MissingVals) {
3607     PHINode *PHI = cast<PHINode>(I.first);
3608     // One corner case we have to handle is two IVs "chasing" each other,
3609     // that is %IV2 = phi [...], [ %IV1, %latch ]
3610     // In this case, if IV1 has an external use, we need to avoid adding both
3611     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3612     // don't already have an incoming value for the middle block.
3613     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3614       PHI->addIncoming(I.second, MiddleBlock);
3615   }
3616 }
3617 
3618 namespace {
3619 
3620 struct CSEDenseMapInfo {
3621   static bool canHandle(const Instruction *I) {
3622     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3623            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3624   }
3625 
3626   static inline Instruction *getEmptyKey() {
3627     return DenseMapInfo<Instruction *>::getEmptyKey();
3628   }
3629 
3630   static inline Instruction *getTombstoneKey() {
3631     return DenseMapInfo<Instruction *>::getTombstoneKey();
3632   }
3633 
3634   static unsigned getHashValue(const Instruction *I) {
3635     assert(canHandle(I) && "Unknown instruction!");
3636     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3637                                                            I->value_op_end()));
3638   }
3639 
3640   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3641     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3642         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3643       return LHS == RHS;
3644     return LHS->isIdenticalTo(RHS);
3645   }
3646 };
3647 
3648 } // end anonymous namespace
3649 
3650 /// Perform CSE of induction variable instructions.
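     ///
     /// For illustration (hand-written IR, not from an actual test), two
     /// identical extracts such as
     ///   %e0 = extractelement <4 x i32> %v, i32 0
     ///   %e1 = extractelement <4 x i32> %v, i32 0
     /// hash to the same key, so %e1 is replaced by %e0 and erased.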
3651 static void cse(BasicBlock *BB) {
3652   // Perform simple cse.
3653   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3654   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3655     Instruction *In = &*I++;
3656 
3657     if (!CSEDenseMapInfo::canHandle(In))
3658       continue;
3659 
3660     // Check if we can replace this instruction with any of the
3661     // visited instructions.
3662     if (Instruction *V = CSEMap.lookup(In)) {
3663       In->replaceAllUsesWith(V);
3664       In->eraseFromParent();
3665       continue;
3666     }
3667 
3668     CSEMap[In] = In;
3669   }
3670 }
3671 
3672 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3673                                                        ElementCount VF,
3674                                                        bool &NeedToScalarize) {
3675   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3676   Function *F = CI->getCalledFunction();
3677   Type *ScalarRetTy = CI->getType();
3678   SmallVector<Type *, 4> Tys, ScalarTys;
3679   for (auto &ArgOp : CI->arg_operands())
3680     ScalarTys.push_back(ArgOp->getType());
3681 
3682   // Estimate cost of scalarized vector call. The source operands are assumed
3683   // to be vectors, so we need to extract individual elements from there,
3684   // execute VF scalar calls, and then gather the result into the vector return
3685   // value.
3686   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3687                                                  TTI::TCK_RecipThroughput);
3688   if (VF.isScalar())
3689     return ScalarCallCost;
3690 
3691   // Compute corresponding vector type for return value and arguments.
3692   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3693   for (Type *ScalarTy : ScalarTys)
3694     Tys.push_back(ToVectorTy(ScalarTy, VF));
3695 
3696   // Compute costs of unpacking argument values for the scalar calls and
3697   // packing the return values to a vector.
3698   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3699 
3700   unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
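       // Purely illustrative numbers: with VF = 4, a scalar call cost of 10 and
       // a scalarization overhead of 6, Cost = 10 * 4 + 6 = 46.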
3701 
3702   // If we can't emit a vector call for this function, then the currently found
3703   // cost is the cost we need to return.
3704   NeedToScalarize = true;
3705   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3706   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3707 
3708   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3709     return Cost;
3710 
3711   // If the corresponding vector cost is cheaper, return its cost.
3712   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3713                                                  TTI::TCK_RecipThroughput);
3714   if (VectorCallCost < Cost) {
3715     NeedToScalarize = false;
3716     return VectorCallCost;
3717   }
3718   return Cost;
3719 }
3720 
3721 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3722                                                             ElementCount VF) {
3723   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3724   assert(ID && "Expected intrinsic call!");
3725 
3726   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3727   return TTI.getIntrinsicInstrCost(CostAttrs,
3728                                    TargetTransformInfo::TCK_RecipThroughput);
3729 }
3730 
3731 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3732   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3733   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3734   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3735 }
3736 
3737 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3738   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3739   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3740   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3741 }
3742 
3743 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3744   // For every instruction `I` in MinBWs, truncate the operands, create a
3745   // truncated version of `I` and reextend its result. InstCombine runs
3746   // later and will remove any ext/trunc pairs.
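       //
       // A minimal sketch (hand-written IR, names are illustrative): if MinBWs
       // says the result of
       //   %a = add <4 x i32> %x, %y
       // only needs 8 bits, it is rewritten as
       //   %x.tr = trunc <4 x i32> %x to <4 x i8>
       //   %y.tr = trunc <4 x i32> %y to <4 x i8>
       //   %a.tr = add <4 x i8> %x.tr, %y.tr
       //   %a.ze = zext <4 x i8> %a.tr to <4 x i32>
       // and users of %a are redirected to %a.ze.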
3747   SmallPtrSet<Value *, 4> Erased;
3748   for (const auto &KV : Cost->getMinimalBitwidths()) {
3749     // If the value wasn't vectorized, we must maintain the original scalar
3750     // type. The absence of the value from VectorLoopValueMap indicates that it
3751     // wasn't vectorized.
3752     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3753       continue;
3754     for (unsigned Part = 0; Part < UF; ++Part) {
3755       Value *I = getOrCreateVectorValue(KV.first, Part);
3756       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3757         continue;
3758       Type *OriginalTy = I->getType();
3759       Type *ScalarTruncatedTy =
3760           IntegerType::get(OriginalTy->getContext(), KV.second);
3761       auto *TruncatedTy = FixedVectorType::get(
3762           ScalarTruncatedTy,
3763           cast<FixedVectorType>(OriginalTy)->getNumElements());
3764       if (TruncatedTy == OriginalTy)
3765         continue;
3766 
3767       IRBuilder<> B(cast<Instruction>(I));
3768       auto ShrinkOperand = [&](Value *V) -> Value * {
3769         if (auto *ZI = dyn_cast<ZExtInst>(V))
3770           if (ZI->getSrcTy() == TruncatedTy)
3771             return ZI->getOperand(0);
3772         return B.CreateZExtOrTrunc(V, TruncatedTy);
3773       };
3774 
3775       // The actual instruction modification depends on the instruction type,
3776       // unfortunately.
3777       Value *NewI = nullptr;
3778       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3779         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3780                              ShrinkOperand(BO->getOperand(1)));
3781 
3782         // Any wrapping introduced by shrinking this operation shouldn't be
3783         // considered undefined behavior. So, we can't unconditionally copy
3784         // arithmetic wrapping flags to NewI.
3785         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3786       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3787         NewI =
3788             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3789                          ShrinkOperand(CI->getOperand(1)));
3790       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3791         NewI = B.CreateSelect(SI->getCondition(),
3792                               ShrinkOperand(SI->getTrueValue()),
3793                               ShrinkOperand(SI->getFalseValue()));
3794       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3795         switch (CI->getOpcode()) {
3796         default:
3797           llvm_unreachable("Unhandled cast!");
3798         case Instruction::Trunc:
3799           NewI = ShrinkOperand(CI->getOperand(0));
3800           break;
3801         case Instruction::SExt:
3802           NewI = B.CreateSExtOrTrunc(
3803               CI->getOperand(0),
3804               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3805           break;
3806         case Instruction::ZExt:
3807           NewI = B.CreateZExtOrTrunc(
3808               CI->getOperand(0),
3809               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3810           break;
3811         }
3812       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3813         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3814                              ->getNumElements();
3815         auto *O0 = B.CreateZExtOrTrunc(
3816             SI->getOperand(0),
3817             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3818         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3819                              ->getNumElements();
3820         auto *O1 = B.CreateZExtOrTrunc(
3821             SI->getOperand(1),
3822             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3823 
3824         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3825       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3826         // Don't do anything with the operands, just extend the result.
3827         continue;
3828       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3829         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3830                             ->getNumElements();
3831         auto *O0 = B.CreateZExtOrTrunc(
3832             IE->getOperand(0),
3833             FixedVectorType::get(ScalarTruncatedTy, Elements));
3834         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3835         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3836       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3837         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3838                             ->getNumElements();
3839         auto *O0 = B.CreateZExtOrTrunc(
3840             EE->getOperand(0),
3841             FixedVectorType::get(ScalarTruncatedTy, Elements));
3842         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3843       } else {
3844         // If we don't know what to do, be conservative and don't do anything.
3845         continue;
3846       }
3847 
3848       // Lastly, extend the result.
3849       NewI->takeName(cast<Instruction>(I));
3850       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3851       I->replaceAllUsesWith(Res);
3852       cast<Instruction>(I)->eraseFromParent();
3853       Erased.insert(I);
3854       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3855     }
3856   }
3857 
3858   // We'll have created a bunch of ZExts that are now dead. Clean them up.
3859   for (const auto &KV : Cost->getMinimalBitwidths()) {
3860     // If the value wasn't vectorized, we must maintain the original scalar
3861     // type. The absence of the value from VectorLoopValueMap indicates that it
3862     // wasn't vectorized.
3863     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3864       continue;
3865     for (unsigned Part = 0; Part < UF; ++Part) {
3866       Value *I = getOrCreateVectorValue(KV.first, Part);
3867       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3868       if (Inst && Inst->use_empty()) {
3869         Value *NewI = Inst->getOperand(0);
3870         Inst->eraseFromParent();
3871         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3872       }
3873     }
3874   }
3875 }
3876 
3877 void InnerLoopVectorizer::fixVectorizedLoop() {
3878   // Insert truncates and extends for any truncated instructions as hints to
3879   // InstCombine.
3880   if (VF.isVector())
3881     truncateToMinimalBitwidths();
3882 
3883   // Fix widened non-induction PHIs by setting up the PHI operands.
3884   if (OrigPHIsToFix.size()) {
3885     assert(EnableVPlanNativePath &&
3886            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3887     fixNonInductionPHIs();
3888   }
3889 
3890   // At this point every instruction in the original loop is widened to a
3891   // vector form. Now we need to fix the recurrences in the loop. These PHI
3892   // nodes are currently empty because we did not want to introduce cycles.
3893   // This is the second stage of vectorizing recurrences.
3894   fixCrossIterationPHIs();
3895 
3896   // Forget the original basic block.
3897   PSE.getSE()->forgetLoop(OrigLoop);
3898 
3899   // Fix-up external users of the induction variables.
3900   for (auto &Entry : Legal->getInductionVars())
3901     fixupIVUsers(Entry.first, Entry.second,
3902                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3903                  IVEndValues[Entry.first], LoopMiddleBlock);
3904 
3905   fixLCSSAPHIs();
3906   for (Instruction *PI : PredicatedInstructions)
3907     sinkScalarOperands(&*PI);
3908 
3909   // Remove redundant induction instructions.
3910   cse(LoopVectorBody);
3911 
3912   // Set/update profile weights for the vector and remainder loops as original
3913   // loop iterations are now distributed among them. Note that original loop
3914   // represented by LoopScalarBody becomes remainder loop after vectorization.
3915   //
3916   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3917   // end up with a slightly less precise result, but that should be OK since
3918   // the profile is not inherently precise anyway. Note also that a possible
3919   // bypass of the vector code caused by legality checks is ignored,
3920   // optimistically assigning all the weight to the vector loop.
3921   //
3922   // For scalable vectorization we can't know at compile time how many
3923   // iterations of the loop are handled in one vector iteration, so instead
3924   // we assume a pessimistic vscale of '1'.
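       //
       // Rough illustration with made-up numbers: if the original loop ran about
       // 100 iterations and VF * UF = 8, the vector loop is assigned roughly
       // 100 / 8 = 12 iterations and the remainder loop the leftover ~4.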
3925   setProfileInfoAfterUnrolling(
3926       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3927       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3928 }
3929 
3930 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3931   // In order to support recurrences we need to be able to vectorize Phi nodes.
3932   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3933   // stage #2: We now need to fix the recurrences by adding incoming edges to
3934   // the currently empty PHI nodes. At this point every instruction in the
3935   // original loop is widened to a vector form so we can use them to construct
3936   // the incoming edges.
3937   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3938     // Handle first-order recurrences and reductions that need to be fixed.
3939     if (Legal->isFirstOrderRecurrence(&Phi))
3940       fixFirstOrderRecurrence(&Phi);
3941     else if (Legal->isReductionVariable(&Phi))
3942       fixReduction(&Phi);
3943   }
3944 }
3945 
3946 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3947   // This is the second phase of vectorizing first-order recurrences. An
3948   // overview of the transformation is described below. Suppose we have the
3949   // following loop.
3950   //
3951   //   for (int i = 0; i < n; ++i)
3952   //     b[i] = a[i] - a[i - 1];
3953   //
3954   // There is a first-order recurrence on "a". For this loop, the shorthand
3955   // scalar IR looks like:
3956   //
3957   //   scalar.ph:
3958   //     s_init = a[-1]
3959   //     br scalar.body
3960   //
3961   //   scalar.body:
3962   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3963   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3964   //     s2 = a[i]
3965   //     b[i] = s2 - s1
3966   //     br cond, scalar.body, ...
3967   //
3968   // In this example, s1 is a recurrence because its value depends on the
3969   // previous iteration. In the first phase of vectorization, we created a
3970   // temporary value for s1. We now complete the vectorization and produce the
3971   // shorthand vector IR shown below (for VF = 4, UF = 1).
3972   //
3973   //   vector.ph:
3974   //     v_init = vector(..., ..., ..., a[-1])
3975   //     br vector.body
3976   //
3977   //   vector.body
3978   //     i = phi [0, vector.ph], [i+4, vector.body]
3979   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3980   //     v2 = a[i, i+1, i+2, i+3];
3981   //     v3 = vector(v1(3), v2(0, 1, 2))
3982   //     b[i, i+1, i+2, i+3] = v2 - v3
3983   //     br cond, vector.body, middle.block
3984   //
3985   //   middle.block:
3986   //     x = v2(3)
3987   //     br scalar.ph
3988   //
3989   //   scalar.ph:
3990   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3991   //     br scalar.body
3992   //
3993   // After the vector loop completes execution, we extract the next value of
3994   // the recurrence (x) to use as the initial value in the scalar loop.
3995 
3996   // Get the original loop preheader and single loop latch.
3997   auto *Preheader = OrigLoop->getLoopPreheader();
3998   auto *Latch = OrigLoop->getLoopLatch();
3999 
4000   // Get the initial and previous values of the scalar recurrence.
4001   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4002   auto *Previous = Phi->getIncomingValueForBlock(Latch);
4003 
4004   // Create a vector from the initial value.
4005   auto *VectorInit = ScalarInit;
4006   if (VF.isVector()) {
4007     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4008     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4009     VectorInit = Builder.CreateInsertElement(
4010         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4011         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
4012   }
4013 
4014   // We constructed a temporary phi node in the first phase of vectorization.
4015   // This phi node will eventually be deleted.
4016   Builder.SetInsertPoint(
4017       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
4018 
4019   // Create a phi node for the new recurrence. The current value will either be
4020   // the initial value inserted into a vector or a loop-varying vector value.
4021   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4022   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4023 
4024   // Get the vectorized previous value of the last part UF - 1. It appears last
4025   // among all unrolled iterations, due to the order of their construction.
4026   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
4027 
4028   // Find and set the insertion point after the previous value if it is an
4029   // instruction.
4030   BasicBlock::iterator InsertPt;
4031   // Note that the previous value may have been constant-folded so it is not
4032   // guaranteed to be an instruction in the vector loop.
4033   // FIXME: Loop invariant values do not form recurrences. We should deal with
4034   //        them earlier.
4035   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4036     InsertPt = LoopVectorBody->getFirstInsertionPt();
4037   else {
4038     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4039     if (isa<PHINode>(PreviousLastPart))
4040       // If the previous value is a phi node, we should insert after all the phi
4041       // nodes in the block containing the PHI to avoid breaking basic block
4042       // verification. Note that the basic block may be different to
4043       // LoopVectorBody, in case we predicate the loop.
4044       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4045     else
4046       InsertPt = ++PreviousInst->getIterator();
4047   }
4048   Builder.SetInsertPoint(&*InsertPt);
4049 
4050   // We will construct a vector for the recurrence by combining the values for
4051   // the current and previous iterations. This is the required shuffle mask.
4052   assert(!VF.isScalable());
4053   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
4054   ShuffleMask[0] = VF.getKnownMinValue() - 1;
4055   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
4056     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
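       // For example, with VF = 4 the mask is <3, 4, 5, 6>: lane 0 takes the
       // last lane of the first shuffle operand and lanes 1-3 take the first
       // three lanes of the second operand.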
4057 
4058   // The vector from which to take the initial value for the current iteration
4059   // (actual or unrolled). Initially, this is the vector phi node.
4060   Value *Incoming = VecPhi;
4061 
4062   // Shuffle the current and previous vector and update the vector parts.
4063   for (unsigned Part = 0; Part < UF; ++Part) {
4064     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
4065     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
4066     auto *Shuffle =
4067         VF.isVector()
4068             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
4069             : Incoming;
4070     PhiPart->replaceAllUsesWith(Shuffle);
4071     cast<Instruction>(PhiPart)->eraseFromParent();
4072     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
4073     Incoming = PreviousPart;
4074   }
4075 
4076   // Fix the latch value of the new recurrence in the vector loop.
4077   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4078 
4079   // Extract the last vector element in the middle block. This will be the
4080   // initial value for the recurrence when jumping to the scalar loop.
4081   auto *ExtractForScalar = Incoming;
4082   if (VF.isVector()) {
4083     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4084     ExtractForScalar = Builder.CreateExtractElement(
4085         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
4086         "vector.recur.extract");
4087   }
4088   // Extract the second-to-last element in the middle block if the
4089   // Phi is used outside the loop. We need to extract the phi itself
4090   // and not the last element (the phi update in the current iteration). This
4091   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4092   // when the scalar loop is not run at all.
4093   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4094   if (VF.isVector())
4095     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4096         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4097         "vector.recur.extract.for.phi");
4098   // When the loop is unrolled without vectorizing, initialize
4099   // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
4100   // `Incoming`. This is analogous to the vectorized case above: extracting
4101   // the second-to-last element when VF > 1.
4102   else if (UF > 1)
4103     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
4104 
4105   // Fix the initial value of the original recurrence in the scalar loop.
4106   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4107   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4108   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4109     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4110     Start->addIncoming(Incoming, BB);
4111   }
4112 
4113   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4114   Phi->setName("scalar.recur");
4115 
4116   // Finally, fix users of the recurrence outside the loop. The users will need
4117   // either the last value of the scalar recurrence or the last value of the
4118   // vector recurrence we extracted in the middle block. Since the loop is in
4119   // LCSSA form, we just need to find all the phi nodes for the original scalar
4120   // recurrence in the exit block, and then add an edge for the middle block.
4121   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4122     if (LCSSAPhi.getIncomingValue(0) == Phi) {
4123       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4124     }
4125   }
4126 }
4127 
4128 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
4129   Constant *Zero = Builder.getInt32(0);
4130 
4131   // Get its reduction variable descriptor.
4132   assert(Legal->isReductionVariable(Phi) &&
4133          "Unable to find the reduction variable");
4134   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4135 
4136   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4137   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4138   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4139   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
4140     RdxDesc.getMinMaxRecurrenceKind();
4141   setDebugLocFromInst(Builder, ReductionStartValue);
4142   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4143 
4144   // We need to generate a reduction vector from the incoming scalar.
4145   // To do so, we need to generate the 'identity' vector and override
4146   // one of the elements with the incoming scalar reduction. We need
4147   // to do it in the vector-loop preheader.
4148   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4149 
4150   // This is the vector-clone of the value that leaves the loop.
4151   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
4152 
4153   // Find the reduction identity variable. Zero for addition, or and xor;
4154   // one for multiplication; -1 for and.
4155   Value *Identity;
4156   Value *VectorStart;
4157   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
4158       RK == RecurrenceDescriptor::RK_FloatMinMax) {
4159     // MinMax reductions have the start value as their identity.
4160     if (VF.isScalar() || IsInLoopReductionPhi) {
4161       VectorStart = Identity = ReductionStartValue;
4162     } else {
4163       VectorStart = Identity =
4164         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
4165     }
4166   } else {
4167     // Handle other reduction kinds:
4168     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
4169         RK, MinMaxKind, VecTy->getScalarType());
4170     if (VF.isScalar() || IsInLoopReductionPhi) {
4171       Identity = Iden;
4172       // This vector is the Identity vector where the first element is the
4173       // incoming scalar reduction.
4174       VectorStart = ReductionStartValue;
4175     } else {
4176       Identity = ConstantVector::getSplat(VF, Iden);
4177 
4178       // This vector is the Identity vector where the first element is the
4179       // incoming scalar reduction.
4180       VectorStart =
4181         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
4182     }
4183   }
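       // For illustration (conceptually, with made-up names): a vectorized,
       // not in-loop, i32 add reduction with VF = 4 and start value %s uses
       //   Identity    = <i32 0, i32 0, i32 0, i32 0>
       //   VectorStart = <i32 %s, i32 0, i32 0, i32 0>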
4184 
4185   // Wrap flags are in general invalid after vectorization, clear them.
4186   clearReductionWrapFlags(RdxDesc);
4187 
4188   // Fix the vector-loop phi.
4189 
4190   // Reductions do not have to start at zero. They can start with
4191   // any loop invariant values.
4192   BasicBlock *Latch = OrigLoop->getLoopLatch();
4193   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4194 
4195   for (unsigned Part = 0; Part < UF; ++Part) {
4196     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
4197     Value *Val = getOrCreateVectorValue(LoopVal, Part);
4198     // Make sure to add the reduction start value only to the
4199     // first unroll part.
4200     Value *StartVal = (Part == 0) ? VectorStart : Identity;
4201     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
4202     cast<PHINode>(VecRdxPhi)
4203       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4204   }
4205 
4206   // Before each round, move the insertion point right between
4207   // the PHIs and the values we are going to write.
4208   // This allows us to write both PHINodes and the extractelement
4209   // instructions.
4210   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4211 
4212   setDebugLocFromInst(Builder, LoopExitInst);
4213 
4214   // If the tail is folded by masking, the vector value leaving the loop should
4215   // be a Select between the vectorized LoopExitInst and the vectorized Phi,
4216   // rather than the LoopExitInst itself. For an inloop reduction the reduction
4217   // will already be predicated, and does not need to be handled here.
4218   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4219     for (unsigned Part = 0; Part < UF; ++Part) {
4220       Value *VecLoopExitInst =
4221           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4222       Value *Sel = nullptr;
4223       for (User *U : VecLoopExitInst->users()) {
4224         if (isa<SelectInst>(U)) {
4225           assert(!Sel && "Reduction exit feeding two selects");
4226           Sel = U;
4227         } else
4228           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4229       }
4230       assert(Sel && "Reduction exit feeds no select");
4231       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4232 
4233       // If the target can create a predicated operator for the reduction at no
4234       // extra cost in the loop (for example a predicated vadd), it can be
4235       // cheaper for the select to remain in the loop than be sunk out of it,
4236       // and so use the select value for the phi instead of the old
4237       // LoopExitValue.
4238       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4239       if (PreferPredicatedReductionSelect ||
4240           TTI->preferPredicatedReductionSelect(
4241               RdxDesc.getRecurrenceBinOp(), Phi->getType(),
4242               TargetTransformInfo::ReductionFlags())) {
4243         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4244         VecRdxPhi->setIncomingValueForBlock(
4245             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4246       }
4247     }
4248   }
4249 
4250   // If the vector reduction can be performed in a smaller type, we truncate
4251   // then extend the loop exit value to enable InstCombine to evaluate the
4252   // entire expression in the smaller type.
4253   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4254     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4255     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4256     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4257     Builder.SetInsertPoint(
4258         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4259     VectorParts RdxParts(UF);
4260     for (unsigned Part = 0; Part < UF; ++Part) {
4261       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4262       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4263       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4264                                         : Builder.CreateZExt(Trunc, VecTy);
4265       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4266            UI != RdxParts[Part]->user_end();)
4267         if (*UI != Trunc) {
4268           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4269           RdxParts[Part] = Extnd;
4270         } else {
4271           ++UI;
4272         }
4273     }
4274     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4275     for (unsigned Part = 0; Part < UF; ++Part) {
4276       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4277       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4278     }
4279   }
4280 
4281   // Reduce all of the unrolled parts into a single vector.
4282   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4283   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4284 
4285   // The middle block terminator has already been assigned a DebugLoc here (the
4286   // OrigLoop's single latch terminator). We want the whole middle block to
4287   // appear to execute on this line because: (a) it is all compiler generated,
4288   // (b) these instructions are always executed after evaluating the latch
4289   // conditional branch, and (c) other passes may add new predecessors which
4290   // terminate on this line. This is the easiest way to ensure we don't
4291   // accidentally cause an extra step back into the loop while debugging.
4292   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4293   for (unsigned Part = 1; Part < UF; ++Part) {
4294     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4295     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4296       // Floating point operations had to be 'fast' to enable the reduction.
4297       ReducedPartRdx = addFastMathFlag(
4298           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4299                               ReducedPartRdx, "bin.rdx"),
4300           RdxDesc.getFastMathFlags());
4301     else
4302       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4303                                       RdxPart);
4304   }
4305 
4306   // Create the reduction after the loop. Note that inloop reductions create the
4307   // target reduction in the loop using a Reduction recipe.
4308   if (VF.isVector() && !IsInLoopReductionPhi) {
4309     bool NoNaN = Legal->hasFunNoNaNAttr();
4310     ReducedPartRdx =
4311         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4312     // If the reduction can be performed in a smaller type, we need to extend
4313     // the reduction to the wider type before we branch to the original loop.
4314     if (Phi->getType() != RdxDesc.getRecurrenceType())
4315       ReducedPartRdx =
4316         RdxDesc.isSigned()
4317         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4318         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4319   }
4320 
4321   // Create a phi node that merges control-flow from the backedge-taken check
4322   // block and the middle block.
4323   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4324                                         LoopScalarPreHeader->getTerminator());
4325   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4326     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4327   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4328 
4329   // Now, we need to fix the users of the reduction variable
4330   // inside and outside of the scalar remainder loop.
4331   // We know that the loop is in LCSSA form. We need to update the
4332   // PHI nodes in the exit blocks.
4333   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4334     // All PHINodes need to have a single entry edge, or two if
4335     // we already fixed them.
4336     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4337 
4338     // We found a reduction value exit-PHI. Update it with the
4339     // incoming bypass edge.
4340     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4341       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4342   } // end of the LCSSA phi scan.
4343 
4344   // Fix the scalar loop reduction variable with the incoming reduction sum
4345   // from the vector body and from the backedge value.
4346   int IncomingEdgeBlockIdx =
4347     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4348   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4349   // Pick the other block.
4350   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4351   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4352   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4353 }
4354 
4355 void InnerLoopVectorizer::clearReductionWrapFlags(
4356     RecurrenceDescriptor &RdxDesc) {
4357   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4358   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4359       RK != RecurrenceDescriptor::RK_IntegerMult)
4360     return;
4361 
4362   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4363   assert(LoopExitInstr && "null loop exit instruction");
4364   SmallVector<Instruction *, 8> Worklist;
4365   SmallPtrSet<Instruction *, 8> Visited;
4366   Worklist.push_back(LoopExitInstr);
4367   Visited.insert(LoopExitInstr);
4368 
4369   while (!Worklist.empty()) {
4370     Instruction *Cur = Worklist.pop_back_val();
4371     if (isa<OverflowingBinaryOperator>(Cur))
4372       for (unsigned Part = 0; Part < UF; ++Part) {
4373         Value *V = getOrCreateVectorValue(Cur, Part);
4374         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4375       }
4376 
4377     for (User *U : Cur->users()) {
4378       Instruction *UI = cast<Instruction>(U);
4379       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4380           Visited.insert(UI).second)
4381         Worklist.push_back(UI);
4382     }
4383   }
4384 }
4385 
4386 void InnerLoopVectorizer::fixLCSSAPHIs() {
4387   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4388     if (LCSSAPhi.getNumIncomingValues() == 1) {
4389       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4390       // Non-instruction incoming values have a single value; use lane zero.
4391       unsigned LastLane = 0;
4392       if (isa<Instruction>(IncomingValue))
4393         LastLane = Cost->isUniformAfterVectorization(
4394                        cast<Instruction>(IncomingValue), VF)
4395                        ? 0
4396                        : VF.getKnownMinValue() - 1;
4397       assert((!VF.isScalable() || LastLane == 0) &&
4398              "scalable vectors don't support non-uniform scalars yet");
4399       // Can be a loop invariant incoming value or the last scalar value to be
4400       // extracted from the vectorized loop.
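           // For illustration: with VF = 4 and UF = 2, a non-uniform instruction
           // contributes lane 3 of part 1 (its last scalar value), while a
           // uniform one contributes lane 0.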
4401       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4402       Value *lastIncomingValue =
4403           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4404       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4405     }
4406   }
4407 }
4408 
4409 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4410   // The basic block and loop containing the predicated instruction.
4411   auto *PredBB = PredInst->getParent();
4412   auto *VectorLoop = LI->getLoopFor(PredBB);
4413 
4414   // Initialize a worklist with the operands of the predicated instruction.
4415   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4416 
4417   // Holds instructions that we need to analyze again. An instruction may be
4418   // reanalyzed if we don't yet know if we can sink it or not.
4419   SmallVector<Instruction *, 8> InstsToReanalyze;
4420 
4421   // Returns true if a given use occurs in the predicated block. Phi nodes use
4422   // their operands in their corresponding predecessor blocks.
4423   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4424     auto *I = cast<Instruction>(U.getUser());
4425     BasicBlock *BB = I->getParent();
4426     if (auto *Phi = dyn_cast<PHINode>(I))
4427       BB = Phi->getIncomingBlock(
4428           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4429     return BB == PredBB;
4430   };
4431 
4432   // Iteratively sink the scalarized operands of the predicated instruction
4433   // into the block we created for it. When an instruction is sunk, its
4434   // operands are then added to the worklist. The algorithm ends when a pass
4435   // through the worklist fails to sink a single instruction.
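       //
       // For illustration (made-up names): a %gep used only by the predicated
       // load is sunk first; once it has moved, an %idx used only by %gep
       // becomes sinkable on a later pass.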
4436   bool Changed;
4437   do {
4438     // Add the instructions that need to be reanalyzed to the worklist, and
4439     // reset the changed indicator.
4440     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4441     InstsToReanalyze.clear();
4442     Changed = false;
4443 
4444     while (!Worklist.empty()) {
4445       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4446 
4447       // We can't sink an instruction if it is a phi node, is already in the
4448       // predicated block, is not in the loop, or may have side effects.
4449       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4450           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4451         continue;
4452 
4453       // It's legal to sink the instruction if all its uses occur in the
4454       // predicated block. Otherwise, there's nothing to do yet, and we may
4455       // need to reanalyze the instruction.
4456       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4457         InstsToReanalyze.push_back(I);
4458         continue;
4459       }
4460 
4461       // Move the instruction to the beginning of the predicated block, and add
4462       // its operands to the worklist.
4463       I->moveBefore(&*PredBB->getFirstInsertionPt());
4464       Worklist.insert(I->op_begin(), I->op_end());
4465 
4466       // The sinking may have enabled other instructions to be sunk, so we will
4467       // need to iterate.
4468       Changed = true;
4469     }
4470   } while (Changed);
4471 }
4472 
4473 void InnerLoopVectorizer::fixNonInductionPHIs() {
4474   for (PHINode *OrigPhi : OrigPHIsToFix) {
4475     PHINode *NewPhi =
4476         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4477     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4478 
4479     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4480         predecessors(OrigPhi->getParent()));
4481     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4482         predecessors(NewPhi->getParent()));
4483     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4484            "Scalar and Vector BB should have the same number of predecessors");
4485 
4486     // The insertion point in Builder may be invalidated by the time we get
4487     // here. Force the Builder insertion point to something valid so that we do
4488     // not run into issues during insertion point restore in
4489     // getOrCreateVectorValue calls below.
4490     Builder.SetInsertPoint(NewPhi);
4491 
4492     // The predecessor order is preserved and we can rely on mapping between
4493     // scalar and vector block predecessors.
4494     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4495       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4496 
4497       // When looking up the new scalar/vector values to fix up, use incoming
4498       // values from original phi.
4499       Value *ScIncV =
4500           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4501 
4502       // The scalar incoming value may need a broadcast.
4503       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4504       NewPhi->addIncoming(NewIncV, NewPredBB);
4505     }
4506   }
4507 }
4508 
4509 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4510                                    VPUser &Operands, unsigned UF,
4511                                    ElementCount VF, bool IsPtrLoopInvariant,
4512                                    SmallBitVector &IsIndexLoopInvariant,
4513                                    VPTransformState &State) {
4514   // Construct a vector GEP by widening the operands of the scalar GEP as
4515   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4516   // results in a vector of pointers when at least one operand of the GEP
4517   // is vector-typed. Thus, to keep the representation compact, we only use
4518   // vector-typed operands for loop-varying values.
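       //
       // A hand-written sketch (illustrative IR): with VF = 4, a loop-varying
       // index %i widens to
       //   %g = getelementptr inbounds i32, i32* %base, <4 x i64> %i.vec
       // which produces a <4 x i32*> vector of addresses.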
4519 
4520   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4521     // If we are vectorizing, but the GEP has only loop-invariant operands,
4522     // the GEP we build (by only using vector-typed operands for
4523     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4524     // produce a vector of pointers, we need to either arbitrarily pick an
4525     // operand to broadcast, or broadcast a clone of the original GEP.
4526     // Here, we broadcast a clone of the original.
4527     //
4528     // TODO: If at some point we decide to scalarize instructions having
4529     //       loop-invariant operands, this special case will no longer be
4530     //       required. We would add the scalarization decision to
4531     //       collectLoopScalars() and teach getVectorValue() to broadcast
4532     //       the lane-zero scalar value.
4533     auto *Clone = Builder.Insert(GEP->clone());
4534     for (unsigned Part = 0; Part < UF; ++Part) {
4535       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4536       State.set(VPDef, GEP, EntryPart, Part);
4537       addMetadata(EntryPart, GEP);
4538     }
4539   } else {
4540     // If the GEP has at least one loop-varying operand, we are sure to
4541     // produce a vector of pointers. But if we are only unrolling, we want
4542     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4543     // produce with the code below will be scalar (if VF == 1) or vector
4544     // (otherwise). Note that for the unroll-only case, we still maintain
4545     // values in the vector mapping with initVector, as we do for other
4546     // instructions.
4547     for (unsigned Part = 0; Part < UF; ++Part) {
4548       // The pointer operand of the new GEP. If it's loop-invariant, we
4549       // won't broadcast it.
4550       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4551                                      : State.get(Operands.getOperand(0), Part);
4552 
4553       // Collect all the indices for the new GEP. If any index is
4554       // loop-invariant, we won't broadcast it.
4555       SmallVector<Value *, 4> Indices;
4556       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4557         VPValue *Operand = Operands.getOperand(I);
4558         if (IsIndexLoopInvariant[I - 1])
4559           Indices.push_back(State.get(Operand, {0, 0}));
4560         else
4561           Indices.push_back(State.get(Operand, Part));
4562       }
4563 
4564       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4565       // but it should be a vector, otherwise.
4566       auto *NewGEP =
4567           GEP->isInBounds()
4568               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4569                                           Indices)
4570               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4571       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4572              "NewGEP is not a pointer vector");
4573       State.set(VPDef, GEP, NewGEP, Part);
4574       addMetadata(NewGEP, GEP);
4575     }
4576   }
4577 }
4578 
4579 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4580                                               ElementCount VF) {
4581   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4582   PHINode *P = cast<PHINode>(PN);
4583   if (EnableVPlanNativePath) {
4584     // Currently we enter here in the VPlan-native path for non-induction
4585     // PHIs where all control flow is uniform. We simply widen these PHIs.
4586     // Create a vector phi with no operands - the vector phi operands will be
4587     // set at the end of vector code generation.
4588     Type *VecTy =
4589         (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4590     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4591     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4592     OrigPHIsToFix.push_back(P);
4593 
4594     return;
4595   }
4596 
4597   assert(PN->getParent() == OrigLoop->getHeader() &&
4598          "Non-header phis should have been handled elsewhere");
4599 
4600   // In order to support recurrences we need to be able to vectorize Phi nodes.
4601   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4602   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4603   // this value when we vectorize all of the instructions that use the PHI.
4604   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4605     for (unsigned Part = 0; Part < UF; ++Part) {
4606       // This is phase one of vectorizing PHIs.
4607       bool ScalarPHI =
4608           (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4609       Type *VecTy =
4610           ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4611       Value *EntryPart = PHINode::Create(
4612           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4613       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4614     }
4615     return;
4616   }
4617 
4618   setDebugLocFromInst(Builder, P);
4619 
4620   // This PHINode must be an induction variable.
4621   // Make sure that we know about it.
4622   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4623 
4624   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4625   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4626 
4627   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4628   // which can be found from the original scalar operations.
4629   switch (II.getKind()) {
4630   case InductionDescriptor::IK_NoInduction:
4631     llvm_unreachable("Unknown induction");
4632   case InductionDescriptor::IK_IntInduction:
4633   case InductionDescriptor::IK_FpInduction:
4634     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4635   case InductionDescriptor::IK_PtrInduction: {
4636     // Handle the pointer induction variable case.
4637     assert(P->getType()->isPointerTy() && "Unexpected type.");
4638 
4639     if (Cost->isScalarAfterVectorization(P, VF)) {
4640       // This is the normalized GEP that starts counting at zero.
4641       Value *PtrInd =
4642           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4643       // Determine the number of scalars we need to generate for each unroll
4644       // iteration. If the instruction is uniform, we only need to generate the
4645       // first lane. Otherwise, we generate all VF values.
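           // For illustration: with VF = 4 and UF = 2, a non-uniform pointer IV
           // produces eight scalars at normalized indices PtrInd + 0 .. PtrInd + 7
           // (Lane + Part * VF); a uniform one produces only PtrInd + 0 and
           // PtrInd + 4.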
4646       unsigned Lanes =
4647           Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4648       for (unsigned Part = 0; Part < UF; ++Part) {
4649         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4650           Constant *Idx = ConstantInt::get(PtrInd->getType(),
4651                                            Lane + Part * VF.getKnownMinValue());
4652           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4653           Value *SclrGep =
4654               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4655           SclrGep->setName("next.gep");
4656           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4657         }
4658       }
4659       return;
4660     }
4661     assert(isa<SCEVConstant>(II.getStep()) &&
4662            "Induction step not a SCEV constant!");
4663     Type *PhiType = II.getStep()->getType();
4664 
4665     // Build a pointer phi
4666     Value *ScalarStartValue = II.getStartValue();
4667     Type *ScStValueType = ScalarStartValue->getType();
4668     PHINode *NewPointerPhi =
4669         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4670     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4671 
4672     // The pointer induction is performed by using a GEP.
4673     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4674     Instruction *InductionLoc = LoopLatch->getTerminator();
4675     const SCEV *ScalarStep = II.getStep();
4676     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4677     Value *ScalarStepValue =
4678         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4679     Value *InductionGEP = GetElementPtrInst::Create(
4680         ScStValueType->getPointerElementType(), NewPointerPhi,
4681         Builder.CreateMul(
4682             ScalarStepValue,
4683             ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4684         "ptr.ind", InductionLoc);
4685     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4686 
4687     // Create UF many actual address geps that use the pointer
4688     // phi as base and a vectorized version of the step value
4689     // (<step*0, ..., step*N>) as offset.
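         // For illustration: with VF = 4 and scalar step %step, part 0 uses the
         // offsets <0, 1, 2, 3> * %step and part 1 uses <4, 5, 6, 7> * %step.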
4690     for (unsigned Part = 0; Part < UF; ++Part) {
4691       SmallVector<Constant *, 8> Indices;
4692       // Create a vector of consecutive numbers from zero to VF.
4693       for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4694         Indices.push_back(
4695             ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4696       Constant *StartOffset = ConstantVector::get(Indices);
4697 
4698       Value *GEP = Builder.CreateGEP(
4699           ScStValueType->getPointerElementType(), NewPointerPhi,
4700           Builder.CreateMul(
4701               StartOffset,
4702               Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4703               "vector.gep"));
4704       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4705     }
4706   }
4707   }
4708 }
4709 
4710 /// A helper function for checking whether an integer division-related
4711 /// instruction may divide by zero (in which case it must be predicated if
4712 /// executed conditionally in the scalar code).
4713 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4714 /// Non-zero divisors that are not compile-time constants will not be
4715 /// converted into multiplication, so we will still end up scalarizing
4716 /// the division, but can do so w/o predication.
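     ///
     /// For illustration: 'udiv i32 %x, 4' can never divide by zero and needs no
     /// predication, whereas 'udiv i32 %x, %y' (or a literal zero divisor) may.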
4717 static bool mayDivideByZero(Instruction &I) {
4718   assert((I.getOpcode() == Instruction::UDiv ||
4719           I.getOpcode() == Instruction::SDiv ||
4720           I.getOpcode() == Instruction::URem ||
4721           I.getOpcode() == Instruction::SRem) &&
4722          "Unexpected instruction");
4723   Value *Divisor = I.getOperand(1);
4724   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4725   return !CInt || CInt->isZero();
4726 }
4727 
4728 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4729                                            VPUser &User,
4730                                            VPTransformState &State) {
4731   switch (I.getOpcode()) {
4732   case Instruction::Call:
4733   case Instruction::Br:
4734   case Instruction::PHI:
4735   case Instruction::GetElementPtr:
4736   case Instruction::Select:
4737     llvm_unreachable("This instruction is handled by a different recipe.");
4738   case Instruction::UDiv:
4739   case Instruction::SDiv:
4740   case Instruction::SRem:
4741   case Instruction::URem:
4742   case Instruction::Add:
4743   case Instruction::FAdd:
4744   case Instruction::Sub:
4745   case Instruction::FSub:
4746   case Instruction::FNeg:
4747   case Instruction::Mul:
4748   case Instruction::FMul:
4749   case Instruction::FDiv:
4750   case Instruction::FRem:
4751   case Instruction::Shl:
4752   case Instruction::LShr:
4753   case Instruction::AShr:
4754   case Instruction::And:
4755   case Instruction::Or:
4756   case Instruction::Xor: {
4757     // Just widen unops and binops.
4758     setDebugLocFromInst(Builder, &I);
4759 
4760     for (unsigned Part = 0; Part < UF; ++Part) {
4761       SmallVector<Value *, 2> Ops;
4762       for (VPValue *VPOp : User.operands())
4763         Ops.push_back(State.get(VPOp, Part));
4764 
4765       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4766 
4767       if (auto *VecOp = dyn_cast<Instruction>(V))
4768         VecOp->copyIRFlags(&I);
4769 
4770       // Use this vector value for all users of the original instruction.
4771       State.set(Def, &I, V, Part);
4772       addMetadata(V, &I);
4773     }
4774 
4775     break;
4776   }
4777   case Instruction::ICmp:
4778   case Instruction::FCmp: {
4779     // Widen compares. Generate vector compares.
4780     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4781     auto *Cmp = cast<CmpInst>(&I);
4782     setDebugLocFromInst(Builder, Cmp);
4783     for (unsigned Part = 0; Part < UF; ++Part) {
4784       Value *A = State.get(User.getOperand(0), Part);
4785       Value *B = State.get(User.getOperand(1), Part);
4786       Value *C = nullptr;
4787       if (FCmp) {
4788         // Propagate fast math flags.
4789         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4790         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4791         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4792       } else {
4793         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4794       }
4795       State.set(Def, &I, C, Part);
4796       addMetadata(C, &I);
4797     }
4798 
4799     break;
4800   }
4801 
4802   case Instruction::ZExt:
4803   case Instruction::SExt:
4804   case Instruction::FPToUI:
4805   case Instruction::FPToSI:
4806   case Instruction::FPExt:
4807   case Instruction::PtrToInt:
4808   case Instruction::IntToPtr:
4809   case Instruction::SIToFP:
4810   case Instruction::UIToFP:
4811   case Instruction::Trunc:
4812   case Instruction::FPTrunc:
4813   case Instruction::BitCast: {
4814     auto *CI = cast<CastInst>(&I);
4815     setDebugLocFromInst(Builder, CI);
4816 
    // Vectorize casts.
4818     Type *DestTy =
4819         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4820 
4821     for (unsigned Part = 0; Part < UF; ++Part) {
4822       Value *A = State.get(User.getOperand(0), Part);
4823       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4824       State.set(Def, &I, Cast, Part);
4825       addMetadata(Cast, &I);
4826     }
4827     break;
4828   }
4829   default:
4830     // This instruction is not vectorized by simple widening.
4831     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4832     llvm_unreachable("Unhandled instruction!");
4833   } // end of switch.
4834 }
4835 
4836 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4837                                                VPUser &ArgOperands,
4838                                                VPTransformState &State) {
4839   assert(!isa<DbgInfoIntrinsic>(I) &&
4840          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4841   setDebugLocFromInst(Builder, &I);
4842 
4843   Module *M = I.getParent()->getParent()->getParent();
4844   auto *CI = cast<CallInst>(&I);
4845 
4846   SmallVector<Type *, 4> Tys;
4847   for (Value *ArgOperand : CI->arg_operands())
4848     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4849 
4850   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4851 
  // The flag indicates whether we use an intrinsic or a plain call for the
  // vectorized version of the instruction, i.e. whether an intrinsic call is
  // more beneficial than a library call.
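  // For example, a sqrt call may be widened either to the llvm.sqrt intrinsic
  // or to a vector library routine known to the VFDatabase; the cost model
  // comparison below picks whichever form is cheaper.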
4855   bool NeedToScalarize = false;
4856   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4857   bool UseVectorIntrinsic =
4858       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4859   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4860          "Instruction should be scalarized elsewhere.");
4861 
4862   for (unsigned Part = 0; Part < UF; ++Part) {
4863     SmallVector<Value *, 4> Args;
4864     for (auto &I : enumerate(ArgOperands.operands())) {
4865       // Some intrinsics have a scalar argument - don't replace it with a
4866       // vector.
4867       Value *Arg;
4868       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4869         Arg = State.get(I.value(), Part);
4870       else
4871         Arg = State.get(I.value(), {0, 0});
4872       Args.push_back(Arg);
4873     }
4874 
4875     Function *VectorF;
4876     if (UseVectorIntrinsic) {
4877       // Use vector version of the intrinsic.
4878       Type *TysForDecl[] = {CI->getType()};
4879       if (VF.isVector()) {
4880         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4881         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4882       }
4883       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4884       assert(VectorF && "Can't retrieve vector intrinsic.");
4885     } else {
4886       // Use vector version of the function call.
4887       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4888 #ifndef NDEBUG
4889       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4890              "Can't create vector function.");
4891 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, &I, V, Part);
    addMetadata(V, &I);
4903   }
4904 }
4905 
4906 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
4907                                                  VPUser &Operands,
4908                                                  bool InvariantCond,
4909                                                  VPTransformState &State) {
4910   setDebugLocFromInst(Builder, &I);
4911 
  // The condition can be loop invariant but still defined inside the
4913   // loop. This means that we can't just use the original 'cond' value.
4914   // We have to take the 'vectorized' value and pick the first lane.
4915   // Instcombine will make this a no-op.
4916   auto *InvarCond =
4917       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4918 
4919   for (unsigned Part = 0; Part < UF; ++Part) {
4920     Value *Cond =
4921         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4922     Value *Op0 = State.get(Operands.getOperand(1), Part);
4923     Value *Op1 = State.get(Operands.getOperand(2), Part);
4924     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4925     State.set(VPDef, &I, Sel, Part);
4926     addMetadata(Sel, &I);
4927   }
4928 }
4929 
4930 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4931   // We should not collect Scalars more than once per VF. Right now, this
4932   // function is called from collectUniformsAndScalars(), which already does
4933   // this check. Collecting Scalars for VF=1 does not make any sense.
4934   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4935          "This function should not be visited twice for the same VF");
4936 
4937   SmallSetVector<Instruction *, 8> Worklist;
4938 
4939   // These sets are used to seed the analysis with pointers used by memory
4940   // accesses that will remain scalar.
4941   SmallSetVector<Instruction *, 8> ScalarPtrs;
4942   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4943   auto *Latch = TheLoop->getLoopLatch();
4944 
4945   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4946   // The pointer operands of loads and stores will be scalar as long as the
4947   // memory access is not a gather or scatter operation. The value operand of a
4948   // store will remain scalar if the store is scalarized.
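  // For example, the pointer of a widened consecutive load stays scalar (only
  // lane 0 is needed to form the vector access), whereas the pointer of a
  // gather does not.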
4949   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4950     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4951     assert(WideningDecision != CM_Unknown &&
4952            "Widening decision should be ready at this moment");
4953     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4954       if (Ptr == Store->getValueOperand())
4955         return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
4958     return WideningDecision != CM_GatherScatter;
4959   };
4960 
4961   // A helper that returns true if the given value is a bitcast or
4962   // getelementptr instruction contained in the loop.
4963   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4964     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4965             isa<GetElementPtrInst>(V)) &&
4966            !TheLoop->isLoopInvariant(V);
4967   };
4968 
4969   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4970     if (!isa<PHINode>(Ptr) ||
4971         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4972       return false;
4973     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4974     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4975       return false;
4976     return isScalarUse(MemAccess, Ptr);
4977   };
4978 
4979   // A helper that evaluates a memory access's use of a pointer. If the
4980   // pointer is actually the pointer induction of a loop, it is being
4981   // inserted into Worklist. If the use will be a scalar use, and the
4982   // pointer is only used by memory accesses, we place the pointer in
4983   // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
4984   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4985     if (isScalarPtrInduction(MemAccess, Ptr)) {
4986       Worklist.insert(cast<Instruction>(Ptr));
4987       Instruction *Update = cast<Instruction>(
4988           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4989       Worklist.insert(Update);
4990       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4991                         << "\n");
4992       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4993                         << "\n");
4994       return;
4995     }
4996     // We only care about bitcast and getelementptr instructions contained in
4997     // the loop.
4998     if (!isLoopVaryingBitCastOrGEP(Ptr))
4999       return;
5000 
5001     // If the pointer has already been identified as scalar (e.g., if it was
5002     // also identified as uniform), there's nothing to do.
5003     auto *I = cast<Instruction>(Ptr);
5004     if (Worklist.count(I))
5005       return;
5006 
5007     // If the use of the pointer will be a scalar use, and all users of the
5008     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5009     // place the pointer in PossibleNonScalarPtrs.
5010     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5011           return isa<LoadInst>(U) || isa<StoreInst>(U);
5012         }))
5013       ScalarPtrs.insert(I);
5014     else
5015       PossibleNonScalarPtrs.insert(I);
5016   };
5017 
  // We seed the scalars analysis with two classes of instructions: (1)
5019   // instructions marked uniform-after-vectorization and (2) bitcast,
5020   // getelementptr and (pointer) phi instructions used by memory accesses
5021   // requiring a scalar use.
5022   //
5023   // (1) Add to the worklist all instructions that have been identified as
5024   // uniform-after-vectorization.
5025   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5026 
5027   // (2) Add to the worklist all bitcast and getelementptr instructions used by
5028   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
5030   // scatter operation. The value operand of a store will remain scalar if the
5031   // store is scalarized.
5032   for (auto *BB : TheLoop->blocks())
5033     for (auto &I : *BB) {
5034       if (auto *Load = dyn_cast<LoadInst>(&I)) {
5035         evaluatePtrUse(Load, Load->getPointerOperand());
5036       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5037         evaluatePtrUse(Store, Store->getPointerOperand());
5038         evaluatePtrUse(Store, Store->getValueOperand());
5039       }
5040     }
5041   for (auto *I : ScalarPtrs)
5042     if (!PossibleNonScalarPtrs.count(I)) {
5043       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5044       Worklist.insert(I);
5045     }
5046 
5047   // Insert the forced scalars.
5048   // FIXME: Currently widenPHIInstruction() often creates a dead vector
5049   // induction variable when the PHI user is scalarized.
5050   auto ForcedScalar = ForcedScalars.find(VF);
5051   if (ForcedScalar != ForcedScalars.end())
5052     for (auto *I : ForcedScalar->second)
5053       Worklist.insert(I);
5054 
5055   // Expand the worklist by looking through any bitcasts and getelementptr
5056   // instructions we've already identified as scalar. This is similar to the
5057   // expansion step in collectLoopUniforms(); however, here we're only
5058   // expanding to include additional bitcasts and getelementptr instructions.
5059   unsigned Idx = 0;
5060   while (Idx != Worklist.size()) {
5061     Instruction *Dst = Worklist[Idx++];
5062     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5063       continue;
5064     auto *Src = cast<Instruction>(Dst->getOperand(0));
5065     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5066           auto *J = cast<Instruction>(U);
5067           return !TheLoop->contains(J) || Worklist.count(J) ||
5068                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5069                   isScalarUse(J, Src));
5070         })) {
5071       Worklist.insert(Src);
5072       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5073     }
5074   }
5075 
5076   // An induction variable will remain scalar if all users of the induction
5077   // variable and induction variable update remain scalar.
5078   for (auto &Induction : Legal->getInductionVars()) {
5079     auto *Ind = Induction.first;
5080     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5081 
5082     // If tail-folding is applied, the primary induction variable will be used
5083     // to feed a vector compare.
5084     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5085       continue;
5086 
5087     // Determine if all users of the induction variable are scalar after
5088     // vectorization.
5089     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5090       auto *I = cast<Instruction>(U);
5091       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5092     });
5093     if (!ScalarInd)
5094       continue;
5095 
5096     // Determine if all users of the induction variable update instruction are
5097     // scalar after vectorization.
5098     auto ScalarIndUpdate =
5099         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5100           auto *I = cast<Instruction>(U);
5101           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5102         });
5103     if (!ScalarIndUpdate)
5104       continue;
5105 
5106     // The induction variable and its update instruction will remain scalar.
5107     Worklist.insert(Ind);
5108     Worklist.insert(IndUpdate);
5109     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5110     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5111                       << "\n");
5112   }
5113 
5114   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5115 }
5116 
5117 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
5118                                                          ElementCount VF) {
5119   if (!blockNeedsPredication(I->getParent()))
5120     return false;
5121   switch(I->getOpcode()) {
5122   default:
5123     break;
5124   case Instruction::Load:
5125   case Instruction::Store: {
5126     if (!Legal->isMaskRequired(I))
5127       return false;
5128     auto *Ptr = getLoadStorePointerOperand(I);
5129     auto *Ty = getMemInstValueType(I);
5130     // We have already decided how to vectorize this instruction, get that
5131     // result.
5132     if (VF.isVector()) {
5133       InstWidening WideningDecision = getWideningDecision(I, VF);
5134       assert(WideningDecision != CM_Unknown &&
5135              "Widening decision should be ready at this moment");
5136       return WideningDecision == CM_Scalarize;
5137     }
5138     const Align Alignment = getLoadStoreAlignment(I);
5139     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5140                                 isLegalMaskedGather(Ty, Alignment))
5141                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5142                                 isLegalMaskedScatter(Ty, Alignment));
5143   }
5144   case Instruction::UDiv:
5145   case Instruction::SDiv:
5146   case Instruction::SRem:
5147   case Instruction::URem:
5148     return mayDivideByZero(*I);
5149   }
5150   return false;
5151 }
5152 
5153 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5154     Instruction *I, ElementCount VF) {
5155   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5156   assert(getWideningDecision(I, VF) == CM_Unknown &&
5157          "Decision should not be set yet.");
5158   auto *Group = getInterleavedAccessGroup(I);
5159   assert(Group && "Must have a group.");
5160 
  // If the instruction's allocated size doesn't equal its type size, it
5162   // requires padding and will be scalarized.
5163   auto &DL = I->getModule()->getDataLayout();
5164   auto *ScalarTy = getMemInstValueType(I);
5165   if (hasIrregularType(ScalarTy, DL, VF))
5166     return false;
5167 
5168   // Check if masking is required.
5169   // A Group may need masking for one of two reasons: it resides in a block that
5170   // needs predication, or it was decided to use masking to deal with gaps.
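  // For example, a load group with gaps normally relies on a scalar epilogue
  // so the last accesses do not run past the end of the underlying data; when
  // no scalar epilogue is allowed, the group must be masked instead.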
5171   bool PredicatedAccessRequiresMasking =
5172       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5173   bool AccessWithGapsRequiresMasking =
5174       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5175   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5176     return true;
5177 
5178   // If masked interleaving is required, we expect that the user/target had
5179   // enabled it, because otherwise it either wouldn't have been created or
5180   // it should have been invalidated by the CostModel.
5181   assert(useMaskedInterleavedAccesses(TTI) &&
5182          "Masked interleave-groups for predicated accesses are not enabled.");
5183 
5184   auto *Ty = getMemInstValueType(I);
5185   const Align Alignment = getLoadStoreAlignment(I);
5186   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5187                           : TTI.isLegalMaskedStore(Ty, Alignment);
5188 }
5189 
5190 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5191     Instruction *I, ElementCount VF) {
5192   // Get and ensure we have a valid memory instruction.
5193   LoadInst *LI = dyn_cast<LoadInst>(I);
5194   StoreInst *SI = dyn_cast<StoreInst>(I);
5195   assert((LI || SI) && "Invalid memory instruction");
5196 
5197   auto *Ptr = getLoadStorePointerOperand(I);
5198 
5199   // In order to be widened, the pointer should be consecutive, first of all.
5200   if (!Legal->isConsecutivePtr(Ptr))
5201     return false;
5202 
5203   // If the instruction is a store located in a predicated block, it will be
5204   // scalarized.
5205   if (isScalarWithPredication(I))
5206     return false;
5207 
  // If the instruction's allocated size doesn't equal its type size, it
5209   // requires padding and will be scalarized.
5210   auto &DL = I->getModule()->getDataLayout();
5211   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5212   if (hasIrregularType(ScalarTy, DL, VF))
5213     return false;
5214 
5215   return true;
5216 }
5217 
5218 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5219   // We should not collect Uniforms more than once per VF. Right now,
5220   // this function is called from collectUniformsAndScalars(), which
5221   // already does this check. Collecting Uniforms for VF=1 does not make any
5222   // sense.
5223 
5224   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5225          "This function should not be visited twice for the same VF");
5226 
  // Visit the list of Uniforms. If we find no uniform value, we will not
  // analyze it again. Uniforms.count(VF) will return 1.
5229   Uniforms[VF].clear();
5230 
5231   // We now know that the loop is vectorizable!
5232   // Collect instructions inside the loop that will remain uniform after
5233   // vectorization.
5234 
5235   // Global values, params and instructions outside of current loop are out of
5236   // scope.
5237   auto isOutOfScope = [&](Value *V) -> bool {
5238     Instruction *I = dyn_cast<Instruction>(V);
5239     return (!I || !TheLoop->contains(I));
5240   };
5241 
5242   SetVector<Instruction *> Worklist;
5243   BasicBlock *Latch = TheLoop->getLoopLatch();
5244 
5245   // Instructions that are scalar with predication must not be considered
5246   // uniform after vectorization, because that would create an erroneous
5247   // replicating region where only a single instance out of VF should be formed.
5248   // TODO: optimize such seldom cases if found important, see PR40816.
5249   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5250     if (isOutOfScope(I)) {
5251       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5252                         << *I << "\n");
5253       return;
5254     }
5255     if (isScalarWithPredication(I, VF)) {
5256       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5257                         << *I << "\n");
5258       return;
5259     }
5260     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5261     Worklist.insert(I);
5262   };
5263 
5264   // Start with the conditional branch. If the branch condition is an
5265   // instruction contained in the loop that is only used by the branch, it is
5266   // uniform.
5267   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5268   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5269     addToWorklistIfAllowed(Cmp);
5270 
5271   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5272     InstWidening WideningDecision = getWideningDecision(I, VF);
5273     assert(WideningDecision != CM_Unknown &&
5274            "Widening decision should be ready at this moment");
5275 
5276     // A uniform memory op is itself uniform.  We exclude uniform stores
5277     // here as they demand the last lane, not the first one.
5278     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5279       assert(WideningDecision == CM_Scalarize);
5280       return true;
5281     }
5282 
5283     return (WideningDecision == CM_Widen ||
5284             WideningDecision == CM_Widen_Reverse ||
5285             WideningDecision == CM_Interleave);
5286   };
5287 
  // Returns true if Ptr is the pointer operand of a memory access instruction
5290   // I, and I is known to not require scalarization.
5291   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5292     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5293   };
5294 
5295   // Holds a list of values which are known to have at least one uniform use.
5296   // Note that there may be other uses which aren't uniform.  A "uniform use"
5297   // here is something which only demands lane 0 of the unrolled iterations;
5298   // it does not imply that all lanes produce the same value (e.g. this is not
5299   // the usual meaning of uniform)
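  // For example, the address of a consecutive, widened load is such a use:
  // only lane 0 of the address is materialized, and the remaining lanes are
  // implied by the consecutive access.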
5300   SmallPtrSet<Value *, 8> HasUniformUse;
5301 
5302   // Scan the loop for instructions which are either a) known to have only
5303   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5304   for (auto *BB : TheLoop->blocks())
5305     for (auto &I : *BB) {
5306       // If there's no pointer operand, there's nothing to do.
5307       auto *Ptr = getLoadStorePointerOperand(&I);
5308       if (!Ptr)
5309         continue;
5310 
5311       // A uniform memory op is itself uniform.  We exclude uniform stores
5312       // here as they demand the last lane, not the first one.
5313       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5314         addToWorklistIfAllowed(&I);
5315 
5316       if (isUniformDecision(&I, VF)) {
5317         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5318         HasUniformUse.insert(Ptr);
5319       }
5320     }
5321 
5322   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5323   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5324   // disallows uses outside the loop as well.
5325   for (auto *V : HasUniformUse) {
5326     if (isOutOfScope(V))
5327       continue;
5328     auto *I = cast<Instruction>(V);
5329     auto UsersAreMemAccesses =
5330       llvm::all_of(I->users(), [&](User *U) -> bool {
5331         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5332       });
5333     if (UsersAreMemAccesses)
5334       addToWorklistIfAllowed(I);
5335   }
5336 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures that
  // a uniform instruction will only be used by uniform instructions.
5340   unsigned idx = 0;
5341   while (idx != Worklist.size()) {
5342     Instruction *I = Worklist[idx++];
5343 
5344     for (auto OV : I->operand_values()) {
5345       // isOutOfScope operands cannot be uniform instructions.
5346       if (isOutOfScope(OV))
5347         continue;
5348       // First order recurrence Phi's should typically be considered
5349       // non-uniform.
5350       auto *OP = dyn_cast<PHINode>(OV);
5351       if (OP && Legal->isFirstOrderRecurrence(OP))
5352         continue;
5353       // If all the users of the operand are uniform, then add the
5354       // operand into the uniform worklist.
5355       auto *OI = cast<Instruction>(OV);
5356       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5357             auto *J = cast<Instruction>(U);
5358             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5359           }))
5360         addToWorklistIfAllowed(OI);
5361     }
5362   }
5363 
5364   // For an instruction to be added into Worklist above, all its users inside
5365   // the loop should also be in Worklist. However, this condition cannot be
5366   // true for phi nodes that form a cyclic dependence. We must process phi
5367   // nodes separately. An induction variable will remain uniform if all users
5368   // of the induction variable and induction variable update remain uniform.
5369   // The code below handles both pointer and non-pointer induction variables.
5370   for (auto &Induction : Legal->getInductionVars()) {
5371     auto *Ind = Induction.first;
5372     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5373 
5374     // Determine if all users of the induction variable are uniform after
5375     // vectorization.
5376     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5377       auto *I = cast<Instruction>(U);
5378       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5379              isVectorizedMemAccessUse(I, Ind);
5380     });
5381     if (!UniformInd)
5382       continue;
5383 
5384     // Determine if all users of the induction variable update instruction are
5385     // uniform after vectorization.
5386     auto UniformIndUpdate =
5387         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5388           auto *I = cast<Instruction>(U);
5389           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5390                  isVectorizedMemAccessUse(I, IndUpdate);
5391         });
5392     if (!UniformIndUpdate)
5393       continue;
5394 
5395     // The induction variable and its update instruction will remain uniform.
5396     addToWorklistIfAllowed(Ind);
5397     addToWorklistIfAllowed(IndUpdate);
5398   }
5399 
5400   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5401 }
5402 
5403 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5404   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5405 
5406   if (Legal->getRuntimePointerChecking()->Need) {
5407     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5408         "runtime pointer checks needed. Enable vectorization of this "
5409         "loop with '#pragma clang loop vectorize(enable)' when "
5410         "compiling with -Os/-Oz",
5411         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5412     return true;
5413   }
5414 
5415   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5416     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5417         "runtime SCEV checks needed. Enable vectorization of this "
5418         "loop with '#pragma clang loop vectorize(enable)' when "
5419         "compiling with -Os/-Oz",
5420         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5421     return true;
5422   }
5423 
5424   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5425   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5426     reportVectorizationFailure("Runtime stride check for small trip count",
5427         "runtime stride == 1 checks needed. Enable vectorization of "
5428         "this loop without such check by compiling with -Os/-Oz",
5429         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5430     return true;
5431   }
5432 
5433   return false;
5434 }
5435 
5436 Optional<ElementCount>
5437 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5438   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to be
    // dynamically uniform if the target can skip it.
5441     reportVectorizationFailure(
5442         "Not inserting runtime ptr check for divergent target",
5443         "runtime pointer checks needed. Not enabled for divergent target",
5444         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5445     return None;
5446   }
5447 
5448   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5449   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5450   if (TC == 1) {
5451     reportVectorizationFailure("Single iteration (non) loop",
5452         "loop trip count is one, irrelevant for vectorization",
5453         "SingleIterationLoop", ORE, TheLoop);
5454     return None;
5455   }
5456 
5457   ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5458 
5459   switch (ScalarEpilogueStatus) {
5460   case CM_ScalarEpilogueAllowed:
5461     return MaxVF;
5462   case CM_ScalarEpilogueNotAllowedUsePredicate:
5463     LLVM_FALLTHROUGH;
5464   case CM_ScalarEpilogueNotNeededUsePredicate:
5465     LLVM_DEBUG(
5466         dbgs() << "LV: vector predicate hint/switch found.\n"
5467                << "LV: Not allowing scalar epilogue, creating predicated "
5468                << "vector loop.\n");
5469     break;
5470   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5471     // fallthrough as a special case of OptForSize
5472   case CM_ScalarEpilogueNotAllowedOptSize:
5473     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5474       LLVM_DEBUG(
5475           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5476     else
5477       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5478                         << "count.\n");
5479 
5480     // Bail if runtime checks are required, which are not good when optimising
5481     // for size.
5482     if (runtimeChecksRequired())
5483       return None;
5484     break;
5485   }
5486 
5487   // Now try the tail folding
5488 
5489   // Invalidate interleave groups that require an epilogue if we can't mask
5490   // the interleave-group.
5491   if (!useMaskedInterleavedAccesses(TTI)) {
5492     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5493            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
5496     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5497   }
5498 
5499   assert(!MaxVF.isScalable() &&
5500          "Scalable vectors do not yet support tail folding");
5501   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5502          "MaxVF must be a power of 2");
5503   unsigned MaxVFtimesIC =
5504       UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5505   if (TC > 0 && TC % MaxVFtimesIC == 0) {
5506     // Accept MaxVF if we do not have a tail.
5507     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5508     return MaxVF;
5509   }
5510 
5511   // If we don't know the precise trip count, or if the trip count that we
5512   // found modulo the vectorization factor is not zero, try to fold the tail
5513   // by masking.
5514   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5515   if (Legal->prepareToFoldTailByMasking()) {
5516     FoldTailByMasking = true;
5517     return MaxVF;
5518   }
5519 
5520   // If there was a tail-folding hint/switch, but we can't fold the tail by
5521   // masking, fallback to a vectorization with a scalar epilogue.
5522   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5523     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5524                          "scalar epilogue instead.\n");
5525     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5526     return MaxVF;
5527   }
5528 
5529   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5530     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5531     return None;
5532   }
5533 
5534   if (TC == 0) {
5535     reportVectorizationFailure(
5536         "Unable to calculate the loop count due to complex control flow",
5537         "unable to calculate the loop count due to complex control flow",
5538         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5539     return None;
5540   }
5541 
5542   reportVectorizationFailure(
5543       "Cannot optimize for size and vectorize at the same time.",
5544       "cannot optimize for size and vectorize at the same time. "
5545       "Enable vectorization of this loop with '#pragma clang loop "
5546       "vectorize(enable)' when compiling with -Os/-Oz",
5547       "NoTailLoopWithOptForSize", ORE, TheLoop);
5548   return None;
5549 }
5550 
5551 ElementCount
5552 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5553                                                  ElementCount UserVF) {
5554   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5555   unsigned SmallestType, WidestType;
5556   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5557   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5558 
5559   // Get the maximum safe dependence distance in bits computed by LAA.
5560   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5561   // the memory accesses that is most restrictive (involved in the smallest
5562   // dependence distance).
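  // For example, a maximum safe VF of 4 over i32 accesses corresponds to
  // 4 * 32 = 128 safe vector bits.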
5563   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
5564 
5565   if (UserVF.isNonZero()) {
5566     // For now, don't verify legality of scalable vectors.
5567     // This will be addressed properly in https://reviews.llvm.org/D91718.
5568     if (UserVF.isScalable())
5569       return UserVF;
5570 
5571     // If legally unsafe, clamp the user vectorization factor to a safe value.
5572     unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
5573     if (UserVF.getFixedValue() <= MaxSafeVF)
5574       return UserVF;
5575 
5576     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5577                       << " is unsafe, clamping to max safe VF=" << MaxSafeVF
5578                       << ".\n");
5579     ORE->emit([&]() {
5580       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5581                                         TheLoop->getStartLoc(),
5582                                         TheLoop->getHeader())
5583              << "User-specified vectorization factor "
5584              << ore::NV("UserVectorizationFactor", UserVF)
5585              << " is unsafe, clamping to maximum safe vectorization factor "
5586              << ore::NV("VectorizationFactor", MaxSafeVF);
5587     });
5588     return ElementCount::getFixed(MaxSafeVF);
5589   }
5590 
5591   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
5592 
5593   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
5595   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
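  // For example, 256-bit registers with a widest element type of 32 bits give
  // a MaxVectorSize of 8; PowerOf2Floor rounds down when the quotient is not
  // a power of 2.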
5596 
5597   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5598                     << " / " << WidestType << " bits.\n");
5599   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5600                     << WidestRegister << " bits.\n");
5601 
5602   assert(MaxVectorSize <= WidestRegister &&
5603          "Did not expect to pack so many elements"
5604          " into one vector!");
5605   if (MaxVectorSize == 0) {
5606     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5607     MaxVectorSize = 1;
5608     return ElementCount::getFixed(MaxVectorSize);
5609   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5610              isPowerOf2_32(ConstTripCount)) {
5611     // We need to clamp the VF to be the ConstTripCount. There is no point in
5612     // choosing a higher viable VF as done in the loop below.
5613     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5614                       << ConstTripCount << "\n");
5615     MaxVectorSize = ConstTripCount;
5616     return ElementCount::getFixed(MaxVectorSize);
5617   }
5618 
5619   unsigned MaxVF = MaxVectorSize;
5620   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5621       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5622     // Collect all viable vectorization factors larger than the default MaxVF
5623     // (i.e. MaxVectorSize).
5624     SmallVector<ElementCount, 8> VFs;
5625     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5626     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5627       VFs.push_back(ElementCount::getFixed(VS));
5628 
5629     // For each VF calculate its register usage.
5630     auto RUs = calculateRegisterUsage(VFs);
5631 
5632     // Select the largest VF which doesn't require more registers than existing
5633     // ones.
5634     for (int i = RUs.size() - 1; i >= 0; --i) {
5635       bool Selected = true;
5636       for (auto& pair : RUs[i].MaxLocalUsers) {
5637         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5638         if (pair.second > TargetNumRegisters)
5639           Selected = false;
5640       }
5641       if (Selected) {
5642         MaxVF = VFs[i].getKnownMinValue();
5643         break;
5644       }
5645     }
5646     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5647       if (MaxVF < MinVF) {
5648         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5649                           << ") with target's minimum: " << MinVF << '\n');
5650         MaxVF = MinVF;
5651       }
5652     }
5653   }
5654   return ElementCount::getFixed(MaxVF);
5655 }
5656 
5657 VectorizationFactor
5658 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
5659   // FIXME: This can be fixed for scalable vectors later, because at this stage
5660   // the LoopVectorizer will only consider vectorizing a loop with scalable
5661   // vectors when the loop has a hint to enable vectorization for a given VF.
5662   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
5663 
5664   float Cost = expectedCost(ElementCount::getFixed(1)).first;
5665   const float ScalarCost = Cost;
5666   unsigned Width = 1;
5667   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5668 
5669   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5670   if (ForceVectorization && MaxVF.isVector()) {
5671     // Ignore scalar width, because the user explicitly wants vectorization.
5672     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5673     // evaluation.
5674     Cost = std::numeric_limits<float>::max();
5675   }
5676 
5677   for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
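    // For example, a vector body of cost 20 at width 4 has a per-lane cost of
    // 5, which is what gets compared against the scalar loop cost.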
5681     VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5682     float VectorCost = C.first / (float)i;
5683     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5684                       << " costs: " << (int)VectorCost << ".\n");
5685     if (!C.second && !ForceVectorization) {
5686       LLVM_DEBUG(
5687           dbgs() << "LV: Not considering vector loop of width " << i
5688                  << " because it will not generate any vector instructions.\n");
5689       continue;
5690     }
5691 
5692     // If profitable add it to ProfitableVF list.
5693     if (VectorCost < ScalarCost) {
5694       ProfitableVFs.push_back(VectorizationFactor(
5695           {ElementCount::getFixed(i), (unsigned)VectorCost}));
5696     }
5697 
5698     if (VectorCost < Cost) {
5699       Cost = VectorCost;
5700       Width = i;
5701     }
5702   }
5703 
5704   if (!EnableCondStoresVectorization && NumPredStores) {
5705     reportVectorizationFailure("There are conditional stores.",
5706         "store that is conditionally executed prevents vectorization",
5707         "ConditionalStore", ORE, TheLoop);
5708     Width = 1;
5709     Cost = ScalarCost;
5710   }
5711 
5712   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5713              << "LV: Vectorization seems to be not beneficial, "
5714              << "but was forced by a user.\n");
5715   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5716   VectorizationFactor Factor = {ElementCount::getFixed(Width),
5717                                 (unsigned)(Width * Cost)};
5718   return Factor;
5719 }
5720 
5721 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5722     const Loop &L, ElementCount VF) const {
5723   // Cross iteration phis such as reductions need special handling and are
5724   // currently unsupported.
5725   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5726         return Legal->isFirstOrderRecurrence(&Phi) ||
5727                Legal->isReductionVariable(&Phi);
5728       }))
5729     return false;
5730 
5731   // Phis with uses outside of the loop require special handling and are
5732   // currently unsupported.
5733   for (auto &Entry : Legal->getInductionVars()) {
5734     // Look for uses of the value of the induction at the last iteration.
5735     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5736     for (User *U : PostInc->users())
5737       if (!L.contains(cast<Instruction>(U)))
5738         return false;
5739     // Look for uses of penultimate value of the induction.
5740     for (User *U : Entry.first->users())
5741       if (!L.contains(cast<Instruction>(U)))
5742         return false;
5743   }
5744 
5745   // Induction variables that are widened require special handling that is
5746   // currently not supported.
5747   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5748         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5749                  this->isProfitableToScalarize(Entry.first, VF));
5750       }))
5751     return false;
5752 
5753   return true;
5754 }
5755 
5756 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5757     const ElementCount VF) const {
5758   // FIXME: We need a much better cost-model to take different parameters such
5759   // as register pressure, code size increase and cost of extra branches into
5760   // account. For now we apply a very crude heuristic and only consider loops
5761   // with vectorization factors larger than a certain value.
5762   // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
5764   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5765     return false;
5766   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5767     return true;
5768   return false;
5769 }
5770 
5771 VectorizationFactor
5772 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5773     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5774   VectorizationFactor Result = VectorizationFactor::Disabled();
5775   if (!EnableEpilogueVectorization) {
5776     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5777     return Result;
5778   }
5779 
5780   if (!isScalarEpilogueAllowed()) {
5781     LLVM_DEBUG(
5782         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5783                   "allowed.\n";);
5784     return Result;
5785   }
5786 
5787   // Not really a cost consideration, but check for unsupported cases here to
5788   // simplify the logic.
5789   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5790     LLVM_DEBUG(
5791         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5792                   "not a supported candidate.\n";);
5793     return Result;
5794   }
5795 
5796   if (EpilogueVectorizationForceVF > 1) {
5797     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5798     if (LVP.hasPlanWithVFs(
5799             {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
5800       return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
5801     else {
5802       LLVM_DEBUG(
5803           dbgs()
5804               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5805       return Result;
5806     }
5807   }
5808 
5809   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5810       TheLoop->getHeader()->getParent()->hasMinSize()) {
5811     LLVM_DEBUG(
5812         dbgs()
5813             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5814     return Result;
5815   }
5816 
5817   if (!isEpilogueVectorizationProfitable(MainLoopVF))
5818     return Result;
5819 
5820   for (auto &NextVF : ProfitableVFs)
5821     if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
5822         (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
5823         LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
5824       Result = NextVF;
5825 
5826   if (Result != VectorizationFactor::Disabled())
5827     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5828                       << Result.Width.getFixedValue() << "\n";);
5829   return Result;
5830 }
5831 
5832 std::pair<unsigned, unsigned>
5833 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5834   unsigned MinWidth = -1U;
5835   unsigned MaxWidth = 8;
5836   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5837 
5838   // For each block.
5839   for (BasicBlock *BB : TheLoop->blocks()) {
5840     // For each instruction in the loop.
5841     for (Instruction &I : BB->instructionsWithoutDebug()) {
5842       Type *T = I.getType();
5843 
5844       // Skip ignored values.
5845       if (ValuesToIgnore.count(&I))
5846         continue;
5847 
5848       // Only examine Loads, Stores and PHINodes.
5849       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5850         continue;
5851 
5852       // Examine PHI nodes that are reduction variables. Update the type to
5853       // account for the recurrence type.
5854       if (auto *PN = dyn_cast<PHINode>(&I)) {
5855         if (!Legal->isReductionVariable(PN))
5856           continue;
5857         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5858         T = RdxDesc.getRecurrenceType();
5859       }
5860 
5861       // Examine the stored values.
5862       if (auto *ST = dyn_cast<StoreInst>(&I))
5863         T = ST->getValueOperand()->getType();
5864 
5865       // Ignore loaded pointer types and stored pointer types that are not
5866       // vectorizable.
5867       //
5868       // FIXME: The check here attempts to predict whether a load or store will
5869       //        be vectorized. We only know this for certain after a VF has
5870       //        been selected. Here, we assume that if an access can be
5871       //        vectorized, it will be. We should also look at extending this
5872       //        optimization to non-pointer types.
5873       //
5874       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5875           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5876         continue;
5877 
5878       MinWidth = std::min(MinWidth,
5879                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5880       MaxWidth = std::max(MaxWidth,
5881                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5882     }
5883   }
5884 
5885   return {MinWidth, MaxWidth};
5886 }
5887 
5888 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5889                                                            unsigned LoopCost) {
5890   // -- The interleave heuristics --
5891   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5892   // There are many micro-architectural considerations that we can't predict
5893   // at this level. For example, frontend pressure (on decode or fetch) due to
5894   // code size, or the number and capabilities of the execution ports.
5895   //
5896   // We use the following heuristics to select the interleave count:
5897   // 1. If the code has reductions, then we interleave to break the cross
5898   // iteration dependency.
5899   // 2. If the loop is really small, then we interleave to reduce the loop
5900   // overhead.
5901   // 3. We don't interleave if we think that we will spill registers to memory
5902   // due to the increased register pressure.
5903 
5904   if (!isScalarEpilogueAllowed())
5905     return 1;
5906 
  // The maximum safe dependence distance was already used to limit the VF, so
  // do not interleave on top of that.
5908   if (Legal->getMaxSafeDepDistBytes() != -1U)
5909     return 1;
5910 
5911   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5912   const bool HasReductions = !Legal->getReductionVars().empty();
5913   // Do not interleave loops with a relatively small known or estimated trip
5914   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because under those conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
5918   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5919       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5920     return 1;
5921 
5922   RegisterUsage R = calculateRegisterUsage({VF})[0];
5923   // We divide by these constants so assume that we have at least one
5924   // instruction that uses at least one register.
5925   for (auto& pair : R.MaxLocalUsers) {
5926     pair.second = std::max(pair.second, 1U);
5927   }
5928 
5929   // We calculate the interleave count using the following formula.
5930   // Subtract the number of loop invariants from the number of available
5931   // registers. These registers are used by all of the interleaved instances.
5932   // Next, divide the remaining registers by the number of registers that is
5933   // required by the loop, in order to estimate how many parallel instances
5934   // fit without causing spills. All of this is rounded down if necessary to be
5935   // a power of two. We want power of two interleave count to simplify any
5936   // addressing operations or alignment considerations.
5937   // We also want power of two interleave counts to ensure that the induction
5938   // variable of the vector loop wraps to zero, when tail is folded by masking;
5939   // this currently happens when OptForSize, in which case IC is set to 1 above.
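  // For example, with 32 available registers, 2 used by loop invariants and a
  // maximal local usage of 6, the basic formula below gives
  // PowerOf2Floor((32 - 2) / 6) = 4 interleaved instances (before the
  // induction-variable adjustment).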
5940   unsigned IC = UINT_MAX;
5941 
5942   for (auto& pair : R.MaxLocalUsers) {
5943     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5944     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5945                       << " registers of "
5946                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5947     if (VF.isScalar()) {
5948       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5949         TargetNumRegisters = ForceTargetNumScalarRegs;
5950     } else {
5951       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5952         TargetNumRegisters = ForceTargetNumVectorRegs;
5953     }
5954     unsigned MaxLocalUsers = pair.second;
5955     unsigned LoopInvariantRegs = 0;
5956     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5957       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5958 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5960     // Don't count the induction variable as interleaved.
5961     if (EnableIndVarRegisterHeur) {
5962       TmpIC =
5963           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5964                         std::max(1U, (MaxLocalUsers - 1)));
5965     }
5966 
5967     IC = std::min(IC, TmpIC);
5968   }
5969 
5970   // Clamp the interleave ranges to reasonable counts.
5971   unsigned MaxInterleaveCount =
5972       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5973 
5974   // Check if the user has overridden the max.
5975   if (VF.isScalar()) {
5976     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5977       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5978   } else {
5979     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5980       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5981   }
5982 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to the trip count divided by VF (but at least 1).
5986   //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second
  // vector iteration is enabled. However, for larger loops, there is likely to
  // be a similar benefit as for fixed-width vectors. For now, we choose to
  // leave the InterleaveCount as if vscale were '1', although if some
  // information about the vector is known (e.g. the minimum vector register
  // size), we can make a better decision.
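  // For example (hypothetical values): with an estimated trip count of 17 and
  // VF = 4, the clamp below limits MaxInterleaveCount to 17 / 4 = 4; with a
  // trip count of 3 and VF = 4 the division yields 0, which is then raised
  // back to 1.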
5993   if (BestKnownTC) {
5994     MaxInterleaveCount =
5995         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5996     // Make sure MaxInterleaveCount is greater than 0.
5997     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5998   }
5999 
6000   assert(MaxInterleaveCount > 0 &&
6001          "Maximum interleave count must be greater than 0");
6002 
  // Clamp the calculated IC to be between 1 and the max interleave count that
  // the target and trip count allow.
6005   if (IC > MaxInterleaveCount)
6006     IC = MaxInterleaveCount;
6007   else
6008     // Make sure IC is greater than 0.
6009     IC = std::max(1u, IC);
6010 
6011   assert(IC > 0 && "Interleave count must be greater than 0.");
6012 
6013   // If we did not calculate the cost for VF (because the user selected the VF)
6014   // then we calculate the cost of VF here.
6015   if (LoopCost == 0)
6016     LoopCost = expectedCost(VF).first;
6017 
6018   assert(LoopCost && "Non-zero loop cost expected");
6019 
6020   // Interleave if we vectorized this loop and there is a reduction that could
6021   // benefit from interleaving.
6022   if (VF.isVector() && HasReductions) {
6023     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6024     return IC;
6025   }
6026 
6027   // Note that if we've already vectorized the loop we will have done the
6028   // runtime check and so interleaving won't require further checks.
6029   bool InterleavingRequiresRuntimePointerCheck =
6030       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6031 
6032   // We want to interleave small loops in order to reduce the loop overhead and
6033   // potentially expose ILP opportunities.
6034   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6035                     << "LV: IC is " << IC << '\n'
6036                     << "LV: VF is " << VF << '\n');
6037   const bool AggressivelyInterleaveReductions =
6038       TTI.enableAggressiveInterleaving(HasReductions);
6039   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the loop overhead has a cost of 1. Using the cost model's
    // estimate of the loop body cost, we interleave until the overhead amounts
    // to roughly 5% of the cost of the interleaved loop.
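    // As an illustration (hypothetical values, assuming the default
    // SmallLoopCost of 20): a loop cost of 3 gives
    // PowerOf2Floor(20 / 3) = 4, so SmallIC = min(IC, 4).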
6043     unsigned SmallIC =
6044         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6045 
6046     // Interleave until store/load ports (estimated by max interleave count) are
6047     // saturated.
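    // For example (hypothetical counts): IC = 8 with 2 stores and 1 load
    // would give StoresIC = 4 and LoadsIC = 8.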
6048     unsigned NumStores = Legal->getNumStores();
6049     unsigned NumLoads = Legal->getNumLoads();
6050     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6051     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6052 
    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit this, by default, to 2,
    // so the critical path only gets increased by one reduction operation.
6057     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6058       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6059       SmallIC = std::min(SmallIC, F);
6060       StoresIC = std::min(StoresIC, F);
6061       LoadsIC = std::min(LoadsIC, F);
6062     }
6063 
6064     if (EnableLoadStoreRuntimeInterleave &&
6065         std::max(StoresIC, LoadsIC) > SmallIC) {
6066       LLVM_DEBUG(
6067           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6068       return std::max(StoresIC, LoadsIC);
6069     }
6070 
6071     // If there are scalar reductions and TTI has enabled aggressive
6072     // interleaving for reductions, we will interleave to expose ILP.
6073     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6074         AggressivelyInterleaveReductions) {
6075       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6076       // Interleave no less than SmallIC but not as aggressive as the normal IC
6077       // to satisfy the rare situation when resources are too limited.
6078       return std::max(IC / 2, SmallIC);
6079     } else {
6080       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6081       return SmallIC;
6082     }
6083   }
6084 
6085   // Interleave if this is a large loop (small loops are already dealt with by
6086   // this point) that could benefit from interleaving.
6087   if (AggressivelyInterleaveReductions) {
6088     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6089     return IC;
6090   }
6091 
6092   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6093   return 1;
6094 }
6095 
6096 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6097 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6098   // This function calculates the register usage by measuring the highest number
6099   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are
6102   // met before their users. We assume that each instruction that has in-loop
6103   // users starts an interval. We record every time that an in-loop value is
6104   // used, so we have a list of the first and last occurrences of each
6105   // instruction. Next, we transpose this data structure into a multi map that
6106   // holds the list of intervals that *end* at a specific location. This multi
6107   // map allows us to perform a linear search. We scan the instructions linearly
6108   // and record each time that a new interval starts, by placing it in a set.
6109   // If we find this value in the multi-map then we remove it from the set.
6110   // The max register usage is the maximum size of the set.
6111   // We also search for instructions that are defined outside the loop, but are
6112   // used inside the loop. We need this number separately from the max-interval
6113   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
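  // As a schematic illustration (not tied to any particular IR), consider the
  // straight-line sequence
  //   %a = ...
  //   %b = add %a, 1
  //   %c = mul %b, %b
  //   %d = add %c, %a
  // where %d has no further in-loop users. The interval of %a lasts until %d
  // and the interval of %b ends at %c, so at most two values (%a with %b, and
  // later %a with %c) are live at once; the estimated maximum usage for their
  // register class is therefore 2.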
6115   LoopBlocksDFS DFS(TheLoop);
6116   DFS.perform(LI);
6117 
6118   RegisterUsage RU;
6119 
6120   // Each 'key' in the map opens a new interval. The values
6121   // of the map are the index of the 'last seen' usage of the
6122   // instruction that is the key.
6123   using IntervalMap = DenseMap<Instruction *, unsigned>;
6124 
6125   // Maps instruction to its index.
6126   SmallVector<Instruction *, 64> IdxToInstr;
6127   // Marks the end of each interval.
6128   IntervalMap EndPoint;
  // Saves the set of instructions that have in-loop users.
6130   SmallPtrSet<Instruction *, 8> Ends;
6131   // Saves the list of values that are used in the loop but are
6132   // defined outside the loop, such as arguments and constants.
6133   SmallPtrSet<Value *, 8> LoopInvariants;
6134 
6135   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6136     for (Instruction &I : BB->instructionsWithoutDebug()) {
6137       IdxToInstr.push_back(&I);
6138 
6139       // Save the end location of each USE.
6140       for (Value *U : I.operands()) {
6141         auto *Instr = dyn_cast<Instruction>(U);
6142 
6143         // Ignore non-instruction values such as arguments, constants, etc.
6144         if (!Instr)
6145           continue;
6146 
6147         // If this instruction is outside the loop then record it and continue.
6148         if (!TheLoop->contains(Instr)) {
6149           LoopInvariants.insert(Instr);
6150           continue;
6151         }
6152 
6153         // Overwrite previous end points.
6154         EndPoint[Instr] = IdxToInstr.size();
6155         Ends.insert(Instr);
6156       }
6157     }
6158   }
6159 
6160   // Saves the list of intervals that end with the index in 'key'.
6161   using InstrList = SmallVector<Instruction *, 2>;
6162   DenseMap<unsigned, InstrList> TransposeEnds;
6163 
6164   // Transpose the EndPoints to a list of values that end at each index.
6165   for (auto &Interval : EndPoint)
6166     TransposeEnds[Interval.second].push_back(Interval.first);
6167 
6168   SmallPtrSet<Instruction *, 8> OpenIntervals;
6169   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6170   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6171 
6172   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6173 
6174   // A lambda that gets the register usage for the given type and VF.
6175   const auto &TTICapture = TTI;
6176   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
6177     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6178       return 0U;
6179     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6180   };
6181 
6182   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6183     Instruction *I = IdxToInstr[i];
6184 
6185     // Remove all of the instructions that end at this location.
6186     InstrList &List = TransposeEnds[i];
6187     for (Instruction *ToRemove : List)
6188       OpenIntervals.erase(ToRemove);
6189 
6190     // Ignore instructions that are never used within the loop.
6191     if (!Ends.count(I))
6192       continue;
6193 
6194     // Skip ignored values.
6195     if (ValuesToIgnore.count(I))
6196       continue;
6197 
6198     // For each VF find the maximum usage of registers.
6199     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6200       // Count the number of live intervals.
6201       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6202 
6203       if (VFs[j].isScalar()) {
6204         for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
6206           if (RegUsage.find(ClassID) == RegUsage.end())
6207             RegUsage[ClassID] = 1;
6208           else
6209             RegUsage[ClassID] += 1;
6210         }
6211       } else {
6212         collectUniformsAndScalars(VFs[j]);
6213         for (auto Inst : OpenIntervals) {
6214           // Skip ignored values for VF > 1.
6215           if (VecValuesToIgnore.count(Inst))
6216             continue;
6217           if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
6219             if (RegUsage.find(ClassID) == RegUsage.end())
6220               RegUsage[ClassID] = 1;
6221             else
6222               RegUsage[ClassID] += 1;
6223           } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
6225             if (RegUsage.find(ClassID) == RegUsage.end())
6226               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6227             else
6228               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6229           }
6230         }
6231       }
6232 
      for (auto &pair : RegUsage) {
        if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] =
              std::max(MaxUsages[j][pair.first], pair.second);
6236         else
6237           MaxUsages[j][pair.first] = pair.second;
6238       }
6239     }
6240 
6241     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6242                       << OpenIntervals.size() << '\n');
6243 
6244     // Add the current instruction to the list of open intervals.
6245     OpenIntervals.insert(I);
6246   }
6247 
6248   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6249     SmallMapVector<unsigned, unsigned, 4> Invariant;
6250 
6251     for (auto Inst : LoopInvariants) {
6252       unsigned Usage =
6253           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6254       unsigned ClassID =
6255           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6256       if (Invariant.find(ClassID) == Invariant.end())
6257         Invariant[ClassID] = Usage;
6258       else
6259         Invariant[ClassID] += Usage;
6260     }
6261 
6262     LLVM_DEBUG({
6263       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6264       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6265              << " item\n";
6266       for (const auto &pair : MaxUsages[i]) {
6267         dbgs() << "LV(REG): RegisterClass: "
6268                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6269                << " registers\n";
6270       }
6271       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6272              << " item\n";
6273       for (const auto &pair : Invariant) {
6274         dbgs() << "LV(REG): RegisterClass: "
6275                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6276                << " registers\n";
6277       }
6278     });
6279 
6280     RU.LoopInvariantRegs = Invariant;
6281     RU.MaxLocalUsers = MaxUsages[i];
6282     RUs[i] = RU;
6283   }
6284 
6285   return RUs;
6286 }
6287 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
6289   // TODO: Cost model for emulated masked load/store is completely
6290   // broken. This hack guides the cost model to use an artificially
6291   // high enough value to practically disable vectorization with such
6292   // operations, except where previously deployed legality hack allowed
6293   // using very low cost values. This is to avoid regressions coming simply
6294   // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Emulation of a limited number of Masked Store/Scatter instructions was
  // allowed.
6297   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
6298   return isa<LoadInst>(I) ||
6299          (isa<StoreInst>(I) &&
6300           NumPredStores > NumberOfStoresToPredicate);
6301 }
6302 
6303 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6304   // If we aren't vectorizing the loop, or if we've already collected the
6305   // instructions to scalarize, there's nothing to do. Collection may already
6306   // have occurred if we have a user-selected VF and are now computing the
6307   // expected cost for interleaving.
6308   if (VF.isScalar() || VF.isZero() ||
6309       InstsToScalarize.find(VF) != InstsToScalarize.end())
6310     return;
6311 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6313   // not profitable to scalarize any instructions, the presence of VF in the
6314   // map will indicate that we've analyzed it already.
6315   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6316 
6317   // Find all the instructions that are scalar with predication in the loop and
6318   // determine if it would be better to not if-convert the blocks they are in.
6319   // If so, we also record the instructions to scalarize.
6320   for (BasicBlock *BB : TheLoop->blocks()) {
6321     if (!blockNeedsPredication(BB))
6322       continue;
6323     for (Instruction &I : *BB)
6324       if (isScalarWithPredication(&I)) {
6325         ScalarCostsTy ScalarCosts;
6326         // Do not apply discount logic if hacked cost is needed
6327         // for emulated masked memrefs.
6328         if (!useEmulatedMaskMemRefHack(&I) &&
6329             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6330           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6331         // Remember that BB will remain after vectorization.
6332         PredicatedBBsAfterVectorization.insert(BB);
6333       }
6334   }
6335 }
6336 
6337 int LoopVectorizationCostModel::computePredInstDiscount(
6338     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
6339     ElementCount VF) {
6340   assert(!isUniformAfterVectorization(PredInst, VF) &&
6341          "Instruction marked uniform-after-vectorization will be predicated");
6342 
6343   // Initialize the discount to zero, meaning that the scalar version and the
6344   // vector version cost the same.
6345   int Discount = 0;
6346 
6347   // Holds instructions to analyze. The instructions we visit are mapped in
6348   // ScalarCosts. Those instructions are the ones that would be scalarized if
6349   // we find that the scalar version costs less.
6350   SmallVector<Instruction *, 8> Worklist;
6351 
6352   // Returns true if the given instruction can be scalarized.
6353   auto canBeScalarized = [&](Instruction *I) -> bool {
6354     // We only attempt to scalarize instructions forming a single-use chain
6355     // from the original predicated block that would otherwise be vectorized.
6356     // Although not strictly necessary, we give up on instructions we know will
6357     // already be scalar to avoid traversing chains that are unlikely to be
6358     // beneficial.
6359     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6360         isScalarAfterVectorization(I, VF))
6361       return false;
6362 
6363     // If the instruction is scalar with predication, it will be analyzed
6364     // separately. We ignore it within the context of PredInst.
6365     if (isScalarWithPredication(I))
6366       return false;
6367 
6368     // If any of the instruction's operands are uniform after vectorization,
6369     // the instruction cannot be scalarized. This prevents, for example, a
6370     // masked load from being scalarized.
6371     //
6372     // We assume we will only emit a value for lane zero of an instruction
6373     // marked uniform after vectorization, rather than VF identical values.
6374     // Thus, if we scalarize an instruction that uses a uniform, we would
6375     // create uses of values corresponding to the lanes we aren't emitting code
6376     // for. This behavior can be changed by allowing getScalarValue to clone
6377     // the lane zero values for uniforms rather than asserting.
6378     for (Use &U : I->operands())
6379       if (auto *J = dyn_cast<Instruction>(U.get()))
6380         if (isUniformAfterVectorization(J, VF))
6381           return false;
6382 
6383     // Otherwise, we can scalarize the instruction.
6384     return true;
6385   };
6386 
6387   // Compute the expected cost discount from scalarizing the entire expression
6388   // feeding the predicated instruction. We currently only consider expressions
6389   // that are single-use instruction chains.
6390   Worklist.push_back(PredInst);
6391   while (!Worklist.empty()) {
6392     Instruction *I = Worklist.pop_back_val();
6393 
6394     // If we've already analyzed the instruction, there's nothing to do.
6395     if (ScalarCosts.find(I) != ScalarCosts.end())
6396       continue;
6397 
6398     // Compute the cost of the vector instruction. Note that this cost already
6399     // includes the scalarization overhead of the predicated instruction.
6400     unsigned VectorCost = getInstructionCost(I, VF).first;
6401 
6402     // Compute the cost of the scalarized instruction. This cost is the cost of
6403     // the instruction as if it wasn't if-converted and instead remained in the
6404     // predicated block. We will scale this cost by block probability after
6405     // computing the scalarization overhead.
6406     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6407     unsigned ScalarCost =
6408         VF.getKnownMinValue() *
6409         getInstructionCost(I, ElementCount::getFixed(1)).first;
6410 
6411     // Compute the scalarization overhead of needed insertelement instructions
6412     // and phi nodes.
6413     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6414       ScalarCost += TTI.getScalarizationOverhead(
6415           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6416           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6417       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6418       ScalarCost +=
6419           VF.getKnownMinValue() *
6420           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6421     }
6422 
6423     // Compute the scalarization overhead of needed extractelement
6424     // instructions. For each of the instruction's operands, if the operand can
6425     // be scalarized, add it to the worklist; otherwise, account for the
6426     // overhead.
6427     for (Use &U : I->operands())
6428       if (auto *J = dyn_cast<Instruction>(U.get())) {
6429         assert(VectorType::isValidElementType(J->getType()) &&
6430                "Instruction has non-scalar type");
6431         if (canBeScalarized(J))
6432           Worklist.push_back(J);
6433         else if (needsExtract(J, VF)) {
6434           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6435           ScalarCost += TTI.getScalarizationOverhead(
6436               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6437               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6438         }
6439       }
6440 
6441     // Scale the total scalar cost by block probability.
6442     ScalarCost /= getReciprocalPredBlockProb();
6443 
6444     // Compute the discount. A non-negative discount means the vector version
6445     // of the instruction costs more, and scalarizing would be beneficial.
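    // For illustration with hypothetical costs: VectorCost = 12 and a
    // probability-scaled ScalarCost = 9 contribute 12 - 9 = 3 to the discount,
    // i.e. scalarizing this instruction is estimated to be 3 units cheaper.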
6446     Discount += VectorCost - ScalarCost;
6447     ScalarCosts[I] = ScalarCost;
6448   }
6449 
6450   return Discount;
6451 }
6452 
6453 LoopVectorizationCostModel::VectorizationCostTy
6454 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6455   VectorizationCostTy Cost;
6456 
6457   // For each block.
6458   for (BasicBlock *BB : TheLoop->blocks()) {
6459     VectorizationCostTy BlockCost;
6460 
6461     // For each instruction in the old loop.
6462     for (Instruction &I : BB->instructionsWithoutDebug()) {
6463       // Skip ignored values.
6464       if (ValuesToIgnore.count(&I) ||
6465           (VF.isVector() && VecValuesToIgnore.count(&I)))
6466         continue;
6467 
6468       VectorizationCostTy C = getInstructionCost(&I, VF);
6469 
6470       // Check if we should override the cost.
6471       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6472         C.first = ForceTargetInstructionCost;
6473 
6474       BlockCost.first += C.first;
6475       BlockCost.second |= C.second;
6476       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6477                         << " for VF " << VF << " For instruction: " << I
6478                         << '\n');
6479     }
6480 
6481     // If we are vectorizing a predicated block, it will have been
6482     // if-converted. This means that the block's instructions (aside from
6483     // stores and instructions that may divide by zero) will now be
6484     // unconditionally executed. For the scalar case, we may not always execute
6485     // the predicated block, if it is an if-else block. Thus, scale the block's
6486     // cost by the probability of executing it. blockNeedsPredication from
6487     // Legal is used so as to not include all blocks in tail folded loops.
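    // For illustration: with the default reciprocal block probability of 2
    // (an assumed 50% execution probability), a predicated block with an
    // estimated scalar cost of 8 contributes 8 / 2 = 4 to the loop cost.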
6488     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6489       BlockCost.first /= getReciprocalPredBlockProb();
6490 
6491     Cost.first += BlockCost.first;
6492     Cost.second |= BlockCost.second;
6493   }
6494 
6495   return Cost;
6496 }
6497 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
6500 ///
6501 /// This SCEV can be sent to the Target in order to estimate the address
6502 /// calculation cost.
static const SCEV *getAddressAccessSCEV(Value *Ptr,
                                        LoopVectorizationLegality *Legal,
                                        PredicatedScalarEvolution &PSE,
                                        const Loop *TheLoop) {
6509   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6510   if (!Gep)
6511     return nullptr;
6512 
6513   // We are looking for a gep with all loop invariant indices except for one
6514   // which should be an induction variable.
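  // For example (illustrative IR only), a pointer such as
  //   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
  // qualifies when %base is loop invariant and %iv is an induction variable,
  // while a GEP with an index that is neither loop invariant nor an induction
  // variable does not.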
6515   auto SE = PSE.getSE();
6516   unsigned NumOperands = Gep->getNumOperands();
6517   for (unsigned i = 1; i < NumOperands; ++i) {
6518     Value *Opd = Gep->getOperand(i);
6519     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6520         !Legal->isInductionVariable(Opd))
6521       return nullptr;
6522   }
6523 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6525   return PSE.getSCEV(Ptr);
6526 }
6527 
6528 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6529   return Legal->hasStride(I->getOperand(0)) ||
6530          Legal->hasStride(I->getOperand(1));
6531 }
6532 
6533 unsigned
6534 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6535                                                         ElementCount VF) {
6536   assert(VF.isVector() &&
6537          "Scalarization cost of instruction implies vectorization.");
6538   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6539   Type *ValTy = getMemInstValueType(I);
6540   auto SE = PSE.getSE();
6541 
6542   unsigned AS = getLoadStoreAddressSpace(I);
6543   Value *Ptr = getLoadStorePointerOperand(I);
6544   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6545 
  // Figure out whether the access is strided and get the stride value, if it
  // is known at compile time.
6548   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6549 
6550   // Get the cost of the scalar memory instruction and address computation.
6551   unsigned Cost =
6552       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6553 
6554   // Don't pass *I here, since it is scalar but will actually be part of a
6555   // vectorized loop where the user of it is a vectorized instruction.
6556   const Align Alignment = getLoadStoreAlignment(I);
6557   Cost += VF.getKnownMinValue() *
6558           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6559                               AS, TTI::TCK_RecipThroughput);
6560 
6561   // Get the overhead of the extractelement and insertelement instructions
6562   // we might create due to scalarization.
6563   Cost += getScalarizationOverhead(I, VF);
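  // As an illustration with hypothetical costs at VF = 4: an address
  // computation cost of 1 and a scalar load cost of 4 give 4 * 1 + 4 * 4 = 20,
  // plus e.g. 8 of insert/extract overhead, for a total of 28; if the access
  // is predicated, the scaling below would halve this to 14.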
6564 
6565   // If we have a predicated store, it may not be executed for each vector
6566   // lane. Scale the cost by the probability of executing the predicated
6567   // block.
6568   if (isPredicatedInst(I)) {
6569     Cost /= getReciprocalPredBlockProb();
6570 
6571     if (useEmulatedMaskMemRefHack(I))
6572       // Artificially setting to a high enough value to practically disable
6573       // vectorization with such operations.
6574       Cost = 3000000;
6575   }
6576 
6577   return Cost;
6578 }
6579 
6580 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6581                                                              ElementCount VF) {
6582   Type *ValTy = getMemInstValueType(I);
6583   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6584   Value *Ptr = getLoadStorePointerOperand(I);
6585   unsigned AS = getLoadStoreAddressSpace(I);
6586   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6587   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6588 
6589   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6590          "Stride should be 1 or -1 for consecutive memory access");
6591   const Align Alignment = getLoadStoreAlignment(I);
6592   unsigned Cost = 0;
6593   if (Legal->isMaskRequired(I))
6594     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6595                                       CostKind);
6596   else
6597     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6598                                 CostKind, I);
6599 
6600   bool Reverse = ConsecutiveStride < 0;
6601   if (Reverse)
6602     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6603   return Cost;
6604 }
6605 
6606 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6607                                                          ElementCount VF) {
6608   assert(Legal->isUniformMemOp(*I));
6609 
6610   Type *ValTy = getMemInstValueType(I);
6611   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6612   const Align Alignment = getLoadStoreAlignment(I);
6613   unsigned AS = getLoadStoreAddressSpace(I);
6614   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6615   if (isa<LoadInst>(I)) {
6616     return TTI.getAddressComputationCost(ValTy) +
6617            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6618                                CostKind) +
6619            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6620   }
6621   StoreInst *SI = cast<StoreInst>(I);
6622 
6623   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6624   return TTI.getAddressComputationCost(ValTy) +
6625          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6626                              CostKind) +
6627          (isLoopInvariantStoreValue
6628               ? 0
6629               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6630                                        VF.getKnownMinValue() - 1));
6631 }
6632 
6633 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6634                                                           ElementCount VF) {
6635   Type *ValTy = getMemInstValueType(I);
6636   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6637   const Align Alignment = getLoadStoreAlignment(I);
6638   const Value *Ptr = getLoadStorePointerOperand(I);
6639 
6640   return TTI.getAddressComputationCost(VectorTy) +
6641          TTI.getGatherScatterOpCost(
6642              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6643              TargetTransformInfo::TCK_RecipThroughput, I);
6644 }
6645 
6646 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6647                                                             ElementCount VF) {
6648   Type *ValTy = getMemInstValueType(I);
6649   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6650   unsigned AS = getLoadStoreAddressSpace(I);
6651 
6652   auto Group = getInterleavedAccessGroup(I);
6653   assert(Group && "Fail to get an interleaved access group.");
6654 
6655   unsigned InterleaveFactor = Group->getFactor();
6656   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6657   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6658 
6659   // Holds the indices of existing members in an interleaved load group.
6660   // An interleaved store group doesn't need this as it doesn't allow gaps.
6661   SmallVector<unsigned, 4> Indices;
6662   if (isa<LoadInst>(I)) {
6663     for (unsigned i = 0; i < InterleaveFactor; i++)
6664       if (Group->getMember(i))
6665         Indices.push_back(i);
6666   }
6667 
6668   // Calculate the cost of the whole interleaved group.
6669   bool UseMaskForGaps =
6670       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6671   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6672       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6673       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6674 
6675   if (Group->isReverse()) {
6676     // TODO: Add support for reversed masked interleaved access.
6677     assert(!Legal->isMaskRequired(I) &&
6678            "Reverse masked interleaved access not supported.");
6679     Cost += Group->getNumMembers() *
6680             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6681   }
6682   return Cost;
6683 }
6684 
6685 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6686                                                               ElementCount VF) {
  // Calculate the scalar cost only. The vectorization cost should already have
  // been computed by this point.
6689   if (VF.isScalar()) {
6690     Type *ValTy = getMemInstValueType(I);
6691     const Align Alignment = getLoadStoreAlignment(I);
6692     unsigned AS = getLoadStoreAddressSpace(I);
6693 
6694     return TTI.getAddressComputationCost(ValTy) +
6695            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6696                                TTI::TCK_RecipThroughput, I);
6697   }
6698   return getWideningCost(I, VF);
6699 }
6700 
6701 LoopVectorizationCostModel::VectorizationCostTy
6702 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6703                                                ElementCount VF) {
6704   // If we know that this instruction will remain uniform, check the cost of
6705   // the scalar version.
6706   if (isUniformAfterVectorization(I, VF))
6707     VF = ElementCount::getFixed(1);
6708 
6709   if (VF.isVector() && isProfitableToScalarize(I, VF))
6710     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6711 
6712   // Forced scalars do not have any scalarization overhead.
6713   auto ForcedScalar = ForcedScalars.find(VF);
6714   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6715     auto InstSet = ForcedScalar->second;
6716     if (InstSet.count(I))
6717       return VectorizationCostTy(
6718           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6719            VF.getKnownMinValue()),
6720           false);
6721   }
6722 
6723   Type *VectorTy;
6724   unsigned C = getInstructionCost(I, VF, VectorTy);
6725 
6726   bool TypeNotScalarized =
6727       VF.isVector() && VectorTy->isVectorTy() &&
6728       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6729   return VectorizationCostTy(C, TypeNotScalarized);
6730 }
6731 
6732 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6733                                                               ElementCount VF) {
6734 
6735   assert(!VF.isScalable() &&
6736          "cannot compute scalarization overhead for scalable vectorization");
6737   if (VF.isScalar())
6738     return 0;
6739 
6740   unsigned Cost = 0;
6741   Type *RetTy = ToVectorTy(I->getType(), VF);
6742   if (!RetTy->isVoidTy() &&
6743       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6744     Cost += TTI.getScalarizationOverhead(
6745         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6746         true, false);
6747 
6748   // Some targets keep addresses scalar.
6749   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6750     return Cost;
6751 
6752   // Some targets support efficient element stores.
6753   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6754     return Cost;
6755 
6756   // Collect operands to consider.
6757   CallInst *CI = dyn_cast<CallInst>(I);
6758   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6759 
6760   // Skip operands that do not require extraction/scalarization and do not incur
6761   // any overhead.
6762   return Cost + TTI.getOperandsScalarizationOverhead(
6763                     filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6764 }
6765 
6766 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6767   if (VF.isScalar())
6768     return;
6769   NumPredStores = 0;
6770   for (BasicBlock *BB : TheLoop->blocks()) {
6771     // For each instruction in the old loop.
6772     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6774       if (!Ptr)
6775         continue;
6776 
6777       // TODO: We should generate better code and update the cost model for
6778       // predicated uniform stores. Today they are treated as any other
6779       // predicated store (see added test cases in
6780       // invariant-store-vectorization.ll).
6781       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6782         NumPredStores++;
6783 
6784       if (Legal->isUniformMemOp(I)) {
6785         // TODO: Avoid replicating loads and stores instead of
6786         // relying on instcombine to remove them.
6787         // Load: Scalar load + broadcast
6788         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6789         unsigned Cost = getUniformMemOpCost(&I, VF);
6790         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6791         continue;
6792       }
6793 
6794       // We assume that widening is the best solution when possible.
6795       if (memoryInstructionCanBeWidened(&I, VF)) {
6796         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6797         int ConsecutiveStride =
6798                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6799         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6800                "Expected consecutive stride.");
6801         InstWidening Decision =
6802             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6803         setWideningDecision(&I, VF, Decision, Cost);
6804         continue;
6805       }
6806 
6807       // Choose between Interleaving, Gather/Scatter or Scalarization.
6808       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6809       unsigned NumAccesses = 1;
6810       if (isAccessInterleaved(&I)) {
6811         auto Group = getInterleavedAccessGroup(&I);
6812         assert(Group && "Fail to get an interleaved access group.");
6813 
6814         // Make one decision for the whole group.
6815         if (getWideningDecision(&I, VF) != CM_Unknown)
6816           continue;
6817 
6818         NumAccesses = Group->getNumMembers();
6819         if (interleavedAccessCanBeWidened(&I, VF))
6820           InterleaveCost = getInterleaveGroupCost(&I, VF);
6821       }
6822 
6823       unsigned GatherScatterCost =
6824           isLegalGatherOrScatter(&I)
6825               ? getGatherScatterCost(&I, VF) * NumAccesses
6826               : std::numeric_limits<unsigned>::max();
6827 
6828       unsigned ScalarizationCost =
6829           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6830 
      // Choose the best solution for the current VF, write down this decision,
      // and use it during vectorization.
6833       unsigned Cost;
6834       InstWidening Decision;
6835       if (InterleaveCost <= GatherScatterCost &&
6836           InterleaveCost < ScalarizationCost) {
6837         Decision = CM_Interleave;
6838         Cost = InterleaveCost;
6839       } else if (GatherScatterCost < ScalarizationCost) {
6840         Decision = CM_GatherScatter;
6841         Cost = GatherScatterCost;
6842       } else {
6843         Decision = CM_Scalarize;
6844         Cost = ScalarizationCost;
6845       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The cost is computed for the whole group,
      // but it will actually be assigned to a single member instruction.
6849       if (auto Group = getInterleavedAccessGroup(&I))
6850         setWideningDecision(Group, VF, Decision, Cost);
6851       else
6852         setWideningDecision(&I, VF, Decision, Cost);
6853     }
6854   }
6855 
6856   // Make sure that any load of address and any other address computation
6857   // remains scalar unless there is gather/scatter support. This avoids
6858   // inevitable extracts into address registers, and also has the benefit of
6859   // activating LSR more, since that pass can't optimize vectorized
6860   // addresses.
6861   if (TTI.prefersVectorizedAddressing())
6862     return;
6863 
6864   // Start with all scalar pointer uses.
6865   SmallPtrSet<Instruction *, 8> AddrDefs;
6866   for (BasicBlock *BB : TheLoop->blocks())
6867     for (Instruction &I : *BB) {
6868       Instruction *PtrDef =
6869         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6870       if (PtrDef && TheLoop->contains(PtrDef) &&
6871           getWideningDecision(&I, VF) != CM_GatherScatter)
6872         AddrDefs.insert(PtrDef);
6873     }
6874 
6875   // Add all instructions used to generate the addresses.
6876   SmallVector<Instruction *, 4> Worklist;
6877   for (auto *I : AddrDefs)
6878     Worklist.push_back(I);
6879   while (!Worklist.empty()) {
6880     Instruction *I = Worklist.pop_back_val();
6881     for (auto &Op : I->operands())
6882       if (auto *InstOp = dyn_cast<Instruction>(Op))
6883         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6884             AddrDefs.insert(InstOp).second)
6885           Worklist.push_back(InstOp);
6886   }
6887 
6888   for (auto *I : AddrDefs) {
6889     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out if
      // the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
6894       InstWidening Decision = getWideningDecision(I, VF);
6895       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6896         // Scalarize a widened load of address.
6897         setWideningDecision(
6898             I, VF, CM_Scalarize,
6899             (VF.getKnownMinValue() *
6900              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6901       else if (auto Group = getInterleavedAccessGroup(I)) {
6902         // Scalarize an interleave group of address loads.
6903         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6904           if (Instruction *Member = Group->getMember(I))
6905             setWideningDecision(
6906                 Member, VF, CM_Scalarize,
6907                 (VF.getKnownMinValue() *
6908                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6909         }
6910       }
6911     } else
      // Make sure I gets scalarized and receives a cost estimate without
      // scalarization overhead.
6914       ForcedScalars[VF].insert(I);
6915   }
6916 }
6917 
6918 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6919                                                         ElementCount VF,
6920                                                         Type *&VectorTy) {
6921   Type *RetTy = I->getType();
6922   if (canTruncateToMinimalBitwidth(I, VF))
6923     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6924   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6925   auto SE = PSE.getSE();
6926   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6927 
6928   // TODO: We need to estimate the cost of intrinsic calls.
6929   switch (I->getOpcode()) {
6930   case Instruction::GetElementPtr:
6931     // We mark this instruction as zero-cost because the cost of GEPs in
6932     // vectorized code depends on whether the corresponding memory instruction
6933     // is scalarized or not. Therefore, we handle GEPs with the memory
6934     // instruction cost.
6935     return 0;
6936   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6940     bool ScalarPredicatedBB = false;
6941     BranchInst *BI = cast<BranchInst>(I);
6942     if (VF.isVector() && BI->isConditional() &&
6943         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6944          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6945       ScalarPredicatedBB = true;
6946 
6947     if (ScalarPredicatedBB) {
6948       // Return cost for branches around scalarized and predicated blocks.
6949       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6950       auto *Vec_i1Ty =
6951           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6952       return (TTI.getScalarizationOverhead(
6953                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
6954                   false, true) +
6955               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
6956                VF.getKnownMinValue()));
6957     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6958       // The back-edge branch will remain, as will all scalar branches.
6959       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6960     else
6961       // This branch will be eliminated by if-conversion.
6962       return 0;
6963     // Note: We currently assume zero cost for an unconditional branch inside
6964     // a predicated block since it will become a fall-through, although we
6965     // may decide in the future to call TTI for all branches.
6966   }
6967   case Instruction::PHI: {
6968     auto *Phi = cast<PHINode>(I);
6969 
6970     // First-order recurrences are replaced by vector shuffles inside the loop.
6971     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6972     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
6973       return TTI.getShuffleCost(
6974           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
6975           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
6976 
6977     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6978     // converted into select instructions. We require N - 1 selects per phi
6979     // node, where N is the number of incoming values.
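    // For example, a phi with three incoming values is costed as two vector
    // selects.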
6980     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6981       return (Phi->getNumIncomingValues() - 1) *
6982              TTI.getCmpSelInstrCost(
6983                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6984                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6985                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6986 
6987     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6988   }
6989   case Instruction::UDiv:
6990   case Instruction::SDiv:
6991   case Instruction::URem:
6992   case Instruction::SRem:
6993     // If we have a predicated instruction, it may not be executed for each
6994     // vector lane. Get the scalarization cost and scale this amount by the
6995     // probability of executing the predicated block. If the instruction is not
6996     // predicated, we fall through to the next case.
6997     if (VF.isVector() && isScalarWithPredication(I)) {
6998       unsigned Cost = 0;
6999 
7000       // These instructions have a non-void type, so account for the phi nodes
7001       // that we will create. This cost is likely to be zero. The phi node
7002       // cost, if any, should be scaled by the block probability because it
7003       // models a copy at the end of each predicated block.
7004       Cost += VF.getKnownMinValue() *
7005               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7006 
7007       // The cost of the non-predicated instruction.
7008       Cost += VF.getKnownMinValue() *
7009               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7010 
7011       // The cost of insertelement and extractelement instructions needed for
7012       // scalarization.
7013       Cost += getScalarizationOverhead(I, VF);
7014 
7015       // Scale the cost by the probability of executing the predicated blocks.
7016       // This assumes the predicated block for each vector lane is equally
7017       // likely.
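      // For illustration with hypothetical costs at VF = 4: a scalar divide
      // cost of 20 gives 4 * 20 = 80, plus e.g. 8 of insert/extract overhead
      // and a phi cost of 0, i.e. 88 in total, which the division below
      // reduces to 44.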
7018       return Cost / getReciprocalPredBlockProb();
7019     }
7020     LLVM_FALLTHROUGH;
7021   case Instruction::Add:
7022   case Instruction::FAdd:
7023   case Instruction::Sub:
7024   case Instruction::FSub:
7025   case Instruction::Mul:
7026   case Instruction::FMul:
7027   case Instruction::FDiv:
7028   case Instruction::FRem:
7029   case Instruction::Shl:
7030   case Instruction::LShr:
7031   case Instruction::AShr:
7032   case Instruction::And:
7033   case Instruction::Or:
7034   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go away.
7036     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7037       return 0;
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
7040     Value *Op2 = I->getOperand(1);
7041     TargetTransformInfo::OperandValueProperties Op2VP;
7042     TargetTransformInfo::OperandValueKind Op2VK =
7043         TTI.getOperandInfo(Op2, Op2VP);
7044     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7045       Op2VK = TargetTransformInfo::OK_UniformValue;
7046 
7047     SmallVector<const Value *, 4> Operands(I->operand_values());
7048     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7049     return N * TTI.getArithmeticInstrCost(
7050                    I->getOpcode(), VectorTy, CostKind,
7051                    TargetTransformInfo::OK_AnyValue,
7052                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7053   }
7054   case Instruction::FNeg: {
7055     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7056     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7057     return N * TTI.getArithmeticInstrCost(
7058                    I->getOpcode(), VectorTy, CostKind,
7059                    TargetTransformInfo::OK_AnyValue,
7060                    TargetTransformInfo::OK_AnyValue,
7061                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
7062                    I->getOperand(0), I);
7063   }
7064   case Instruction::Select: {
7065     SelectInst *SI = cast<SelectInst>(I);
7066     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7067     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7068     Type *CondTy = SI->getCondition()->getType();
7069     if (!ScalarCond) {
7070       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7071       CondTy = VectorType::get(CondTy, VF);
7072     }
7073     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7074                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7075   }
7076   case Instruction::ICmp:
7077   case Instruction::FCmp: {
7078     Type *ValTy = I->getOperand(0)->getType();
7079     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7080     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7081       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7082     VectorTy = ToVectorTy(ValTy, VF);
7083     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7084                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7085   }
7086   case Instruction::Store:
7087   case Instruction::Load: {
7088     ElementCount Width = VF;
7089     if (Width.isVector()) {
7090       InstWidening Decision = getWideningDecision(I, Width);
7091       assert(Decision != CM_Unknown &&
7092              "CM decision should be taken at this point");
7093       if (Decision == CM_Scalarize)
7094         Width = ElementCount::getFixed(1);
7095     }
7096     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
7097     return getMemoryInstructionCost(I, VF);
7098   }
7099   case Instruction::ZExt:
7100   case Instruction::SExt:
7101   case Instruction::FPToUI:
7102   case Instruction::FPToSI:
7103   case Instruction::FPExt:
7104   case Instruction::PtrToInt:
7105   case Instruction::IntToPtr:
7106   case Instruction::SIToFP:
7107   case Instruction::UIToFP:
7108   case Instruction::Trunc:
7109   case Instruction::FPTrunc:
7110   case Instruction::BitCast: {
7111     // Computes the CastContextHint from a Load/Store instruction.
7112     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7113       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7114              "Expected a load or a store!");
7115 
7116       if (VF.isScalar() || !TheLoop->contains(I))
7117         return TTI::CastContextHint::Normal;
7118 
7119       switch (getWideningDecision(I, VF)) {
7120       case LoopVectorizationCostModel::CM_GatherScatter:
7121         return TTI::CastContextHint::GatherScatter;
7122       case LoopVectorizationCostModel::CM_Interleave:
7123         return TTI::CastContextHint::Interleave;
7124       case LoopVectorizationCostModel::CM_Scalarize:
7125       case LoopVectorizationCostModel::CM_Widen:
7126         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7127                                         : TTI::CastContextHint::Normal;
7128       case LoopVectorizationCostModel::CM_Widen_Reverse:
7129         return TTI::CastContextHint::Reversed;
7130       case LoopVectorizationCostModel::CM_Unknown:
7131         llvm_unreachable("Instr did not go through cost modelling?");
7132       }
7133 
7134       llvm_unreachable("Unhandled case!");
7135     };
7136 
7137     unsigned Opcode = I->getOpcode();
7138     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7139     // For Trunc, the context is the only user, which must be a StoreInst.
7140     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7141       if (I->hasOneUse())
7142         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7143           CCH = ComputeCCH(Store);
7144     }
7145     // For Z/Sext, the context is the operand, which must be a LoadInst.
7146     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7147              Opcode == Instruction::FPExt) {
7148       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7149         CCH = ComputeCCH(Load);
7150     }
7151 
7152     // We optimize the truncation of induction variables having constant
7153     // integer steps. The cost of these truncations is the same as the scalar
7154     // operation.
7155     if (isOptimizableIVTruncate(I, VF)) {
7156       auto *Trunc = cast<TruncInst>(I);
7157       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7158                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7159     }
7160 
7161     Type *SrcScalarTy = I->getOperand(0)->getType();
7162     Type *SrcVecTy =
7163         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7164     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
7167       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7168       //
7169       // Calculate the modified src and dest types.
7170       Type *MinVecTy = VectorTy;
7171       if (Opcode == Instruction::Trunc) {
7172         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7173         VectorTy =
7174             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7175       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7176         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7177         VectorTy =
7178             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7179       }
7180     }
7181 
7182     assert(!VF.isScalable() && "VF is assumed to be non scalable");
7183     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7184     return N *
7185            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7186   }
7187   case Instruction::Call: {
7188     bool NeedToScalarize;
7189     CallInst *CI = cast<CallInst>(I);
7190     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7191     if (getVectorIntrinsicIDForCall(CI, TLI))
7192       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
7193     return CallCost;
7194   }
7195   case Instruction::ExtractValue: {
7196     InstructionCost ExtractCost =
7197         TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7198     assert(ExtractCost.isValid() && "Invalid cost for ExtractValue");
7199     return *(ExtractCost.getValue());
7200   }
7201   default:
7202     // The cost of executing VF copies of the scalar instruction. This opcode
7203     // is unknown. Assume that it is the same as 'mul'.
7204     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
7205                                        Instruction::Mul, VectorTy, CostKind) +
7206            getScalarizationOverhead(I, VF);
7207   } // end of switch.
7208 }
7209 
7210 char LoopVectorize::ID = 0;
7211 
7212 static const char lv_name[] = "Loop Vectorization";
7213 
7214 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7215 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7216 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7217 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7218 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7219 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7220 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7221 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7222 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7223 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7224 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7225 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7226 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7227 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7228 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7229 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7230 
7231 namespace llvm {
7232 
7233 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7234 
7235 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7236                               bool VectorizeOnlyWhenForced) {
7237   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7238 }
7239 
7240 } // end namespace llvm
7241 
7242 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7243   // Check if the pointer operand of a load or store instruction is
7244   // consecutive.
7245   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7246     return Legal->isConsecutivePtr(Ptr);
7247   return false;
7248 }
7249 
7250 void LoopVectorizationCostModel::collectValuesToIgnore() {
7251   // Ignore ephemeral values.
7252   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7253 
7254   // Ignore type-promoting instructions we identified during reduction
7255   // detection.
7256   for (auto &Reduction : Legal->getReductionVars()) {
7257     RecurrenceDescriptor &RedDes = Reduction.second;
7258     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7259     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7260   }
7261   // Ignore type-casting instructions we identified during induction
7262   // detection.
7263   for (auto &Induction : Legal->getInductionVars()) {
7264     InductionDescriptor &IndDes = Induction.second;
7265     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7266     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7267   }
7268 }
7269 
7270 void LoopVectorizationCostModel::collectInLoopReductions() {
7271   for (auto &Reduction : Legal->getReductionVars()) {
7272     PHINode *Phi = Reduction.first;
7273     RecurrenceDescriptor &RdxDesc = Reduction.second;
7274 
7275     // We don't collect reductions that are type promoted (yet).
7276     if (RdxDesc.getRecurrenceType() != Phi->getType())
7277       continue;
7278 
7279     // If the target would prefer this reduction to happen "in-loop", then we
7280     // want to record it as such.
7281     unsigned Opcode = RdxDesc.getRecurrenceBinOp();
7282     if (!PreferInLoopReductions &&
7283         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7284                                    TargetTransformInfo::ReductionFlags()))
7285       continue;
7286 
7287     // Check that we can correctly put the reductions into the loop, by
7288     // finding the chain of operations that leads from the phi to the loop
7289     // exit value.
7290     SmallVector<Instruction *, 4> ReductionOperations =
7291         RdxDesc.getReductionOpChain(Phi, TheLoop);
7292     bool InLoop = !ReductionOperations.empty();
7293     if (InLoop)
7294       InLoopReductionChains[Phi] = ReductionOperations;
7295     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7296                       << " reduction for phi: " << *Phi << "\n");
7297   }
7298 }
7299 
7300 // TODO: we could return a pair of values that specify the max VF and
7301 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7303 // doesn't have a cost model that can choose which plan to execute if
7304 // more than one is generated.
7305 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7306                                  LoopVectorizationCostModel &CM) {
7307   unsigned WidestType;
7308   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
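  // E.g. 256-bit wide vector registers and a widest loop type of i32 (32 bits)
  // yield a VF of 8.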
7309   return WidestVectorRegBits / WidestType;
7310 }
7311 
7312 VectorizationFactor
7313 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7314   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7315   ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before even evaluating whether vectorization is profitable.
7318   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7319   // the vectorization pipeline.
7320   if (!OrigLoop->isInnermost()) {
7321     // If the user doesn't provide a vectorization factor, determine a
7322     // reasonable one.
7323     if (UserVF.isZero()) {
7324       VF = ElementCount::getFixed(
7325           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7326       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7327 
7328       // Make sure we have a VF > 1 for stress testing.
7329       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7330         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7331                           << "overriding computed VF.\n");
7332         VF = ElementCount::getFixed(4);
7333       }
7334     }
7335     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7336     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7337            "VF needs to be a power of two");
7338     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7339                       << "VF " << VF << " to build VPlans.\n");
7340     buildVPlans(VF, VF);
7341 
7342     // For VPlan build stress testing, we bail out after VPlan construction.
7343     if (VPlanBuildStressTest)
7344       return VectorizationFactor::Disabled();
7345 
7346     return {VF, 0 /*Cost*/};
7347   }
7348 
7349   LLVM_DEBUG(
7350       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7351                 "VPlan-native path.\n");
7352   return VectorizationFactor::Disabled();
7353 }
7354 
7355 Optional<VectorizationFactor>
7356 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7357   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7358   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
  if (!MaybeMaxVF) // Cases that should not be vectorized nor interleaved.
7360     return None;
7361 
  // Invalidate interleave groups if all loop blocks will be predicated.
7363   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
7364       !useMaskedInterleavedAccesses(*TTI)) {
7365     LLVM_DEBUG(
7366         dbgs()
7367         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7368            "which requires masked-interleaved support.\n");
7369     if (CM.InterleaveInfo.invalidateGroups())
7370       // Invalidating interleave groups also requires invalidating all decisions
7371       // based on them, which includes widening decisions and uniform and scalar
7372       // values.
7373       CM.invalidateCostModelingDecisions();
7374   }
7375 
7376   ElementCount MaxVF = MaybeMaxVF.getValue();
7377   assert(MaxVF.isNonZero() && "MaxVF is zero.");
7378 
7379   if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) {
7380     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7381     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7382            "VF needs to be a power of two");
7383     // Collect the instructions (and their associated costs) that will be more
7384     // profitable to scalarize.
7385     CM.selectUserVectorizationFactor(UserVF);
7386     CM.collectInLoopReductions();
7387     buildVPlansWithVPRecipes(UserVF, UserVF);
7388     LLVM_DEBUG(printPlans(dbgs()));
7389     return {{UserVF, 0}};
7390   }
7391 
7392   assert(!MaxVF.isScalable() &&
7393          "Scalable vectors not yet supported beyond this point");
7394 
7395   for (ElementCount VF = ElementCount::getFixed(1);
7396        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
7397     // Collect Uniform and Scalar instructions after vectorization with VF.
7398     CM.collectUniformsAndScalars(VF);
7399 
7400     // Collect the instructions (and their associated costs) that will be more
7401     // profitable to scalarize.
7402     if (VF.isVector())
7403       CM.collectInstsToScalarize(VF);
7404   }
7405 
7406   CM.collectInLoopReductions();
7407 
7408   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
7409   LLVM_DEBUG(printPlans(dbgs()));
7410   if (MaxVF.isScalar())
7411     return VectorizationFactor::Disabled();
7412 
7413   // Select the optimal vectorization factor.
7414   return CM.selectVectorizationFactor(MaxVF);
7415 }
7416 
7417 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7418   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7419                     << '\n');
7420   BestVF = VF;
7421   BestUF = UF;
7422 
7423   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7424     return !Plan->hasVF(VF);
7425   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7427 }
7428 
7429 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7430                                            DominatorTree *DT) {
7431   // Perform the actual loop transformation.
7432 
7433   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7434   VPCallbackILV CallbackILV(ILV);
7435 
7436   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7437 
7438   VPTransformState State{*BestVF, BestUF,      LI,
7439                          DT,      ILV.Builder, ILV.VectorLoopValueMap,
7440                          &ILV,    CallbackILV};
7441   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7442   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7443   State.CanonicalIV = ILV.Induction;
7444 
7445   ILV.printDebugTracesAtStart();
7446 
7447   //===------------------------------------------------===//
7448   //
  // Notice: any optimization or new instruction that goes
7450   // into the code below should also be implemented in
7451   // the cost-model.
7452   //
7453   //===------------------------------------------------===//
7454 
7455   // 2. Copy and widen instructions from the old loop into the new loop.
7456   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7457   VPlans.front()->execute(&State);
7458 
7459   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7460   //    predication, updating analyses.
7461   ILV.fixVectorizedLoop();
7462 
7463   ILV.printDebugTracesAtEnd();
7464 }
7465 
7466 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7467     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7468 
  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if they are only used by the
  // terminator.
7472   SmallVector<BasicBlock*> ExitingBlocks;
7473   OrigLoop->getExitingBlocks(ExitingBlocks);
7474   for (auto *BB : ExitingBlocks) {
7475     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7476     if (!Cmp || !Cmp->hasOneUse())
7477       continue;
7478 
7479     // TODO: we should introduce a getUniqueExitingBlocks on Loop
7480     if (!DeadInstructions.insert(Cmp).second)
7481       continue;
7482 
    // An operand of the icmp is often a dead trunc, used by IndUpdate.
7484     // TODO: can recurse through operands in general
7485     for (Value *Op : Cmp->operands()) {
7486       if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
7488     }
7489   }
7490 
7491   // We create new "steps" for induction variable updates to which the original
7492   // induction variables map. An original update instruction will be dead if
7493   // all its users except the induction variable are dead.
7494   auto *Latch = OrigLoop->getLoopLatch();
7495   for (auto &Induction : Legal->getInductionVars()) {
7496     PHINode *Ind = Induction.first;
7497     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7498 
    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
7501     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7502       continue;
7503 
7504     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7505           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7506         }))
7507       DeadInstructions.insert(IndUpdate);
7508 
7509     // We record as "Dead" also the type-casting instructions we had identified
7510     // during induction analysis. We don't need any handling for them in the
7511     // vectorized loop because we have proven that, under a proper runtime
7512     // test guarding the vectorized loop, the value of the phi, and the casted
7513     // value of the phi, are the same. The last instruction in this casting chain
7514     // will get its scalar/vector/widened def from the scalar/vector/widened def
7515     // of the respective phi node. Any other casts in the induction def-use chain
7516     // have no other uses outside the phi update chain, and will be ignored.
7517     InductionDescriptor &IndDes = Induction.second;
7518     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7519     DeadInstructions.insert(Casts.begin(), Casts.end());
7520   }
7521 }
7522 
7523 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7524 
7525 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7526 
7527 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7528                                         Instruction::BinaryOps BinOp) {
7529   // When unrolling and the VF is 1, we only need to add a simple scalar.
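  // E.g. with StartIdx = 2 this produces Val + 2 * Step (or the floating-point
  // equivalent built with BinOp below).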
7530   Type *Ty = Val->getType();
7531   assert(!Ty->isVectorTy() && "Val must be a scalar");
7532 
7533   if (Ty->isFloatingPointTy()) {
7534     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7535 
7536     // Floating point operations had to be 'fast' to enable the unrolling.
7537     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7538     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7539   }
7540   Constant *C = ConstantInt::get(Ty, StartIdx);
7541   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7542 }
7543 
7544 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7545   SmallVector<Metadata *, 4> MDs;
7546   // Reserve first location for self reference to the LoopID metadata node.
7547   MDs.push_back(nullptr);
7548   bool IsUnrollMetadata = false;
7549   MDNode *LoopID = L->getLoopID();
7550   if (LoopID) {
7551     // First find existing loop unrolling disable metadata.
7552     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7553       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7554       if (MD) {
7555         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7556         IsUnrollMetadata =
7557             S && S->getString().startswith("llvm.loop.unroll.disable");
7558       }
7559       MDs.push_back(LoopID->getOperand(i));
7560     }
7561   }
7562 
7563   if (!IsUnrollMetadata) {
7564     // Add runtime unroll disable metadata.
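    // The resulting loop ID has roughly the form:
    //   !0 = !{!0, <existing operands...>, !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}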
7565     LLVMContext &Context = L->getHeader()->getContext();
7566     SmallVector<Metadata *, 1> DisableOperands;
7567     DisableOperands.push_back(
7568         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7569     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7570     MDs.push_back(DisableNode);
7571     MDNode *NewLoopID = MDNode::get(Context, MDs);
7572     // Set operand 0 to refer to the loop id itself.
7573     NewLoopID->replaceOperandWith(0, NewLoopID);
7574     L->setLoopID(NewLoopID);
7575   }
7576 }
7577 
7578 //===--------------------------------------------------------------------===//
7579 // EpilogueVectorizerMainLoop
7580 //===--------------------------------------------------------------------===//
7581 
7582 /// This function is partially responsible for generating the control flow
7583 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7584 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7585   MDNode *OrigLoopID = OrigLoop->getLoopID();
7586   Loop *Lp = createVectorLoopSkeleton("");
7587 
7588   // Generate the code to check the minimum iteration count of the vector
7589   // epilogue (see below).
7590   EPI.EpilogueIterationCountCheck =
7591       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
7592   EPI.EpilogueIterationCountCheck->setName("iter.check");
7593 
7594   // Generate the code to check any assumptions that we've made for SCEV
7595   // expressions.
7596   BasicBlock *SavedPreHeader = LoopVectorPreHeader;
7597   emitSCEVChecks(Lp, LoopScalarPreHeader);
7598 
  // If a safety check was generated, save it.
7600   if (SavedPreHeader != LoopVectorPreHeader)
7601     EPI.SCEVSafetyCheck = SavedPreHeader;
7602 
7603   // Generate the code that checks at runtime if arrays overlap. We put the
7604   // checks into a separate block to make the more common case of few elements
7605   // faster.
7606   SavedPreHeader = LoopVectorPreHeader;
7607   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
7608 
  // If a safety check was generated, save/overwrite it.
7610   if (SavedPreHeader != LoopVectorPreHeader)
7611     EPI.MemSafetyCheck = SavedPreHeader;
7612 
7613   // Generate the iteration count check for the main loop, *after* the check
7614   // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer path length for
  // the main loop is compensated for by the gain from vectorizing the larger
7617   // trip count. Note: the branch will get updated later on when we vectorize
7618   // the epilogue.
7619   EPI.MainLoopIterationCountCheck =
7620       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
7621 
7622   // Generate the induction variable.
7623   OldInduction = Legal->getPrimaryInduction();
7624   Type *IdxTy = Legal->getWidestInductionType();
7625   Value *StartIdx = ConstantInt::get(IdxTy, 0);
7626   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7627   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7628   EPI.VectorTripCount = CountRoundDown;
7629   Induction =
7630       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7631                               getDebugLocFromInstOrOperands(OldInduction));
7632 
7633   // Skip induction resume value creation here because they will be created in
7634   // the second pass. If we created them here, they wouldn't be used anyway,
7635   // because the vplan in the second pass still contains the inductions from the
7636   // original loop.
7637 
7638   return completeLoopSkeleton(Lp, OrigLoopID);
7639 }
7640 
7641 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7642   LLVM_DEBUG({
7643     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7644            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7645            << ", Main Loop UF:" << EPI.MainLoopUF
7646            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7647            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7648   });
7649 }
7650 
7651 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7652   DEBUG_WITH_TYPE(VerboseDebug, {
7653     dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
7654   });
7655 }
7656 
7657 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
7658     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
7659   assert(L && "Expected valid Loop.");
7660   assert(Bypass && "Expected valid bypass basic block.");
7661   unsigned VFactor =
7662       ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
7663   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7664   Value *Count = getOrCreateTripCount(L);
7665   // Reuse existing vector loop preheader for TC checks.
7666   // Note that new preheader block is generated for vector loop.
7667   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7668   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7669 
  // Generate code to check if the loop's trip count is less than VFactor *
  // UFactor (the epilogue's VF * UF when ForEpilogue is set, the main loop's
  // otherwise).
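  // E.g. with VFactor = 4 and UFactor = 2 the branch below bypasses the vector
  // loop when the trip count is below 8 (or at most 8 when a scalar epilogue
  // is required).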
7672   auto P =
7673       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7674 
7675   Value *CheckMinIters = Builder.CreateICmp(
7676       P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
7677       "min.iters.check");
7678 
7679   if (!ForEpilogue)
7680     TCCheckBlock->setName("vector.main.loop.iter.check");
7681 
7682   // Create new preheader for vector loop.
7683   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7684                                    DT, LI, nullptr, "vector.ph");
7685 
7686   if (ForEpilogue) {
7687     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7688                                  DT->getNode(Bypass)->getIDom()) &&
7689            "TC check is expected to dominate Bypass");
7690 
7691     // Update dominator for Bypass & LoopExit.
7692     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7693     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7694 
7695     LoopBypassBlocks.push_back(TCCheckBlock);
7696 
7697     // Save the trip count so we don't have to regenerate it in the
7698     // vec.epilog.iter.check. This is safe to do because the trip count
7699     // generated here dominates the vector epilog iter check.
7700     EPI.TripCount = Count;
7701   }
7702 
7703   ReplaceInstWithInst(
7704       TCCheckBlock->getTerminator(),
7705       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7706 
7707   return TCCheckBlock;
7708 }
7709 
7710 //===--------------------------------------------------------------------===//
7711 // EpilogueVectorizerEpilogueLoop
7712 //===--------------------------------------------------------------------===//
7713 
7714 /// This function is partially responsible for generating the control flow
7715 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7716 BasicBlock *
7717 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7718   MDNode *OrigLoopID = OrigLoop->getLoopID();
7719   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
7720 
7721   // Now, compare the remaining count and if there aren't enough iterations to
  // execute the vectorized epilogue, skip to the scalar part.
7723   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7724   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7725   LoopVectorPreHeader =
7726       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7727                  LI, nullptr, "vec.epilog.ph");
7728   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
7729                                           VecEpilogueIterationCountCheck);
7730 
7731   // Adjust the control flow taking the state info from the main loop
7732   // vectorization into account.
7733   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7734          "expected this to be saved from the previous pass.");
7735   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7736       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7737 
7738   DT->changeImmediateDominator(LoopVectorPreHeader,
7739                                EPI.MainLoopIterationCountCheck);
7740 
7741   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7742       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7743 
7744   if (EPI.SCEVSafetyCheck)
7745     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7746         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7747   if (EPI.MemSafetyCheck)
7748     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7749         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7750 
7751   DT->changeImmediateDominator(
7752       VecEpilogueIterationCountCheck,
7753       VecEpilogueIterationCountCheck->getSinglePredecessor());
7754 
7755   DT->changeImmediateDominator(LoopScalarPreHeader,
7756                                EPI.EpilogueIterationCountCheck);
7757   DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
7758 
7759   // Keep track of bypass blocks, as they feed start values to the induction
7760   // phis in the scalar loop preheader.
7761   if (EPI.SCEVSafetyCheck)
7762     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7763   if (EPI.MemSafetyCheck)
7764     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7765   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7766 
7767   // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
7769   Type *IdxTy = Legal->getWidestInductionType();
7770   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7771                                          LoopVectorPreHeader->getFirstNonPHI());
7772   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7773   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7774                            EPI.MainLoopIterationCountCheck);
7775 
7776   // Generate the induction variable.
7777   OldInduction = Legal->getPrimaryInduction();
7778   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7779   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7780   Value *StartIdx = EPResumeVal;
7781   Induction =
7782       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7783                               getDebugLocFromInstOrOperands(OldInduction));
7784 
7785   // Generate induction resume values. These variables save the new starting
7786   // indexes for the scalar loop. They are used to test if there are any tail
7787   // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, the resume value for the induction variable comes from
7790   // the trip count of the main vector loop, hence passing the AdditionalBypass
7791   // argument.
7792   createInductionResumeValues(Lp, CountRoundDown,
7793                               {VecEpilogueIterationCountCheck,
7794                                EPI.VectorTripCount} /* AdditionalBypass */);
7795 
7796   AddRuntimeUnrollDisableMetaData(Lp);
7797   return completeLoopSkeleton(Lp, OrigLoopID);
7798 }
7799 
7800 BasicBlock *
7801 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7802     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
7803 
7804   assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
7806   assert(
7807       (!isa<Instruction>(EPI.TripCount) ||
7808        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7809       "saved trip count does not dominate insertion point.");
7810   Value *TC = EPI.TripCount;
7811   IRBuilder<> Builder(Insert->getTerminator());
7812   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7813 
7814   // Generate code to check if the loop's trip count is less than VF * UF of the
7815   // vector epilogue loop.
7816   auto P =
7817       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7818 
7819   Value *CheckMinIters = Builder.CreateICmp(
7820       P, Count,
7821       ConstantInt::get(Count->getType(),
7822                        EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
7823       "min.epilog.iters.check");
7824 
7825   ReplaceInstWithInst(
7826       Insert->getTerminator(),
7827       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7828 
7829   LoopBypassBlocks.push_back(Insert);
7830   return Insert;
7831 }
7832 
7833 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7834   LLVM_DEBUG({
7835     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7836            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7837            << ", Main Loop UF:" << EPI.MainLoopUF
7838            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7839            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7840   });
7841 }
7842 
7843 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7844   DEBUG_WITH_TYPE(VerboseDebug, {
7845     dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
7846   });
7847 }
7848 
7849 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7850     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7851   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7852   bool PredicateAtRangeStart = Predicate(Range.Start);
7853 
7854   for (ElementCount TmpVF = Range.Start * 2;
7855        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7856     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7857       Range.End = TmpVF;
7858       break;
7859     }
7860 
7861   return PredicateAtRangeStart;
7862 }
7863 
/// Build VPlans for the full range of feasible VFs = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VFs starting at a given VF and extending it as much as possible. Each
7867 /// vectorization decision can potentially shorten this sub-range during
7868 /// buildVPlan().
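/// For example, with MinVF = 1 and MaxVF = 8, if the first decision change
/// occurs at VF = 4, one VPlan is built covering VFs {1, 2} and another
/// covering VFs {4, 8}.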
7869 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7870                                            ElementCount MaxVF) {
7871   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7872   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7873     VFRange SubRange = {VF, MaxVFPlusOne};
7874     VPlans.push_back(buildVPlan(SubRange));
7875     VF = SubRange.End;
7876   }
7877 }
7878 
7879 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7880                                          VPlanPtr &Plan) {
7881   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7882 
7883   // Look for cached value.
7884   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7885   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7886   if (ECEntryIt != EdgeMaskCache.end())
7887     return ECEntryIt->second;
7888 
7889   VPValue *SrcMask = createBlockInMask(Src, Plan);
7890 
7891   // The terminator has to be a branch inst!
7892   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7893   assert(BI && "Unexpected terminator found");
7894 
7895   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7896     return EdgeMaskCache[Edge] = SrcMask;
7897 
7898   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
7899   assert(EdgeMask && "No Edge Mask found for condition");
7900 
7901   if (BI->getSuccessor(0) != Dst)
7902     EdgeMask = Builder.createNot(EdgeMask);
7903 
7904   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7905     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7906 
7907   return EdgeMaskCache[Edge] = EdgeMask;
7908 }
7909 
7910 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7911   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7912 
7913   // Look for cached value.
7914   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7915   if (BCEntryIt != BlockMaskCache.end())
7916     return BCEntryIt->second;
7917 
7918   // All-one mask is modelled as no-mask following the convention for masked
7919   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7920   VPValue *BlockMask = nullptr;
7921 
7922   if (OrigLoop->getHeader() == BB) {
7923     if (!CM.blockNeedsPredication(BB))
7924       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7925 
7926     // Create the block in mask as the first non-phi instruction in the block.
7927     VPBuilder::InsertPointGuard Guard(Builder);
7928     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
7929     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
7930 
7931     // Introduce the early-exit compare IV <= BTC to form header block mask.
7932     // This is used instead of IV < TC because TC may wrap, unlike BTC.
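    // E.g. with a trip count of 10 and VF = 4, BTC is 9; the last vector
    // iteration compares {8,9,10,11} <= 9 and yields the mask {1,1,0,0}.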
7933     // Start by constructing the desired canonical IV.
7934     VPValue *IV = nullptr;
7935     if (Legal->getPrimaryInduction())
7936       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
7937     else {
7938       auto IVRecipe = new VPWidenCanonicalIVRecipe();
7939       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
7940       IV = IVRecipe->getVPValue();
7941     }
7942     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7943     bool TailFolded = !CM.isScalarEpilogueAllowed();
7944 
7945     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
7946       // While ActiveLaneMask is a binary op that consumes the loop tripcount
7947       // as a second argument, we only pass the IV here and extract the
7948       // tripcount from the transform state where codegen of the VP instructions
      // happens.
7950       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
7951     } else {
7952       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
7953     }
7954     return BlockMaskCache[BB] = BlockMask;
7955   }
7956 
7957   // This is the block mask. We OR all incoming edges.
7958   for (auto *Predecessor : predecessors(BB)) {
7959     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7960     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7961       return BlockMaskCache[BB] = EdgeMask;
7962 
7963     if (!BlockMask) { // BlockMask has its initialized nullptr value.
7964       BlockMask = EdgeMask;
7965       continue;
7966     }
7967 
7968     BlockMask = Builder.createOr(BlockMask, EdgeMask);
7969   }
7970 
7971   return BlockMaskCache[BB] = BlockMask;
7972 }
7973 
7974 VPWidenMemoryInstructionRecipe *
7975 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7976                                   VPlanPtr &Plan) {
7977   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7978          "Must be called with either a load or store");
7979 
7980   auto willWiden = [&](ElementCount VF) -> bool {
7981     if (VF.isScalar())
7982       return false;
7983     LoopVectorizationCostModel::InstWidening Decision =
7984         CM.getWideningDecision(I, VF);
7985     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7986            "CM decision should be taken at this point.");
7987     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7988       return true;
7989     if (CM.isScalarAfterVectorization(I, VF) ||
7990         CM.isProfitableToScalarize(I, VF))
7991       return false;
7992     return Decision != LoopVectorizationCostModel::CM_Scalarize;
7993   };
7994 
7995   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
7996     return nullptr;
7997 
7998   VPValue *Mask = nullptr;
7999   if (Legal->isMaskRequired(I))
8000     Mask = createBlockInMask(I->getParent(), Plan);
8001 
8002   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
8003   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8004     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
8005 
8006   StoreInst *Store = cast<StoreInst>(I);
8007   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
8008   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
8009 }
8010 
8011 VPWidenIntOrFpInductionRecipe *
8012 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
8013   // Check if this is an integer or fp induction. If so, build the recipe that
8014   // produces its scalar and vector values.
8015   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8016   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8017       II.getKind() == InductionDescriptor::IK_FpInduction)
8018     return new VPWidenIntOrFpInductionRecipe(Phi);
8019 
8020   return nullptr;
8021 }
8022 
8023 VPWidenIntOrFpInductionRecipe *
8024 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
8025                                                 VFRange &Range) const {
8026   // Optimize the special case where the source is a constant integer
8027   // induction variable. Notice that we can only optimize the 'trunc' case
8028   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8029   // (c) other casts depend on pointer size.
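  // E.g. "%t = trunc i64 %iv to i32", where %iv is an induction with a
  // constant step, can be widened directly as an i32 induction instead of
  // truncating a wide i64 vector induction.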
8030 
8031   // Determine whether \p K is a truncation based on an induction variable that
8032   // can be optimized.
8033   auto isOptimizableIVTruncate =
8034       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8035     return [=](ElementCount VF) -> bool {
8036       return CM.isOptimizableIVTruncate(K, VF);
8037     };
8038   };
8039 
8040   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8041           isOptimizableIVTruncate(I), Range))
8042     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8043                                              I);
8044   return nullptr;
8045 }
8046 
8047 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
8048   // We know that all PHIs in non-header blocks are converted into selects, so
8049   // we don't have to worry about the insertion order and we can just use the
8050   // builder. At this point we generate the predication tree. There may be
8051   // duplications since this is a simple recursive scan, but future
8052   // optimizations will clean it up.
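  // The operand list interleaves incoming values with their edge masks:
  // {v0, m0, v1, m1, ...}; the mask is omitted only when the phi has a single
  // incoming edge whose mask is all-one.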
8053 
8054   SmallVector<VPValue *, 2> Operands;
8055   unsigned NumIncoming = Phi->getNumIncomingValues();
8056   for (unsigned In = 0; In < NumIncoming; In++) {
8057     VPValue *EdgeMask =
8058       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8059     assert((EdgeMask || NumIncoming == 1) &&
8060            "Multiple predecessors with one having a full mask");
8061     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
8062     if (EdgeMask)
8063       Operands.push_back(EdgeMask);
8064   }
8065   return new VPBlendRecipe(Phi, Operands);
8066 }
8067 
8068 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
8069                                                    VPlan &Plan) const {
8070 
8071   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8072       [this, CI](ElementCount VF) {
8073         return CM.isScalarWithPredication(CI, VF);
8074       },
8075       Range);
8076 
8077   if (IsPredicated)
8078     return nullptr;
8079 
8080   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8081   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8082              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8083              ID == Intrinsic::pseudoprobe))
8084     return nullptr;
8085 
8086   auto willWiden = [&](ElementCount VF) -> bool {
8087     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8088     // The following case may be scalarized depending on the VF.
    // The flag shows whether we use an intrinsic or a plain call for the
    // vectorized version of the instruction.
    // Is it beneficial to perform the intrinsic call compared to the lib call?
8092     bool NeedToScalarize = false;
8093     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8094     bool UseVectorIntrinsic =
8095         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
8096     return UseVectorIntrinsic || !NeedToScalarize;
8097   };
8098 
8099   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8100     return nullptr;
8101 
8102   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
8103 }
8104 
8105 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8106   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8107          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8108   // Instruction should be widened, unless it is scalar after vectorization,
  // scalarization is profitable, or it is predicated.
8110   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8111     return CM.isScalarAfterVectorization(I, VF) ||
8112            CM.isProfitableToScalarize(I, VF) ||
8113            CM.isScalarWithPredication(I, VF);
8114   };
8115   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8116                                                              Range);
8117 }
8118 
8119 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
8120   auto IsVectorizableOpcode = [](unsigned Opcode) {
8121     switch (Opcode) {
8122     case Instruction::Add:
8123     case Instruction::And:
8124     case Instruction::AShr:
8125     case Instruction::BitCast:
8126     case Instruction::FAdd:
8127     case Instruction::FCmp:
8128     case Instruction::FDiv:
8129     case Instruction::FMul:
8130     case Instruction::FNeg:
8131     case Instruction::FPExt:
8132     case Instruction::FPToSI:
8133     case Instruction::FPToUI:
8134     case Instruction::FPTrunc:
8135     case Instruction::FRem:
8136     case Instruction::FSub:
8137     case Instruction::ICmp:
8138     case Instruction::IntToPtr:
8139     case Instruction::LShr:
8140     case Instruction::Mul:
8141     case Instruction::Or:
8142     case Instruction::PtrToInt:
8143     case Instruction::SDiv:
8144     case Instruction::Select:
8145     case Instruction::SExt:
8146     case Instruction::Shl:
8147     case Instruction::SIToFP:
8148     case Instruction::SRem:
8149     case Instruction::Sub:
8150     case Instruction::Trunc:
8151     case Instruction::UDiv:
8152     case Instruction::UIToFP:
8153     case Instruction::URem:
8154     case Instruction::Xor:
8155     case Instruction::ZExt:
8156       return true;
8157     }
8158     return false;
8159   };
8160 
8161   if (!IsVectorizableOpcode(I->getOpcode()))
8162     return nullptr;
8163 
8164   // Success: widen this instruction.
8165   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
8166 }
8167 
8168 VPBasicBlock *VPRecipeBuilder::handleReplication(
8169     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8170     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
8171     VPlanPtr &Plan) {
8172   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8173       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8174       Range);
8175 
8176   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8177       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
8178       Range);
8179 
8180   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8181                                        IsUniform, IsPredicated);
8182   setRecipe(I, Recipe);
8183   Plan->addVPValue(I, Recipe);
8184 
8185   // Find if I uses a predicated instruction. If so, it will use its scalar
8186   // value. Avoid hoisting the insert-element which packs the scalar value into
8187   // a vector value, as that happens iff all users use the vector value.
8188   for (auto &Op : I->operands())
8189     if (auto *PredInst = dyn_cast<Instruction>(Op))
8190       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
8191         PredInst2Recipe[PredInst]->setAlsoPack(false);
8192 
8193   // Finalize the recipe for Instr, first if it is not predicated.
8194   if (!IsPredicated) {
8195     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8196     VPBB->appendRecipe(Recipe);
8197     return VPBB;
8198   }
8199   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8200   assert(VPBB->getSuccessors().empty() &&
8201          "VPBB has successors when handling predicated replication.");
8202   // Record predicated instructions for above packing optimizations.
8203   PredInst2Recipe[I] = Recipe;
8204   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8205   VPBlockUtils::insertBlockAfter(Region, VPBB);
8206   auto *RegSucc = new VPBasicBlock();
8207   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8208   return RegSucc;
8209 }
8210 
8211 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8212                                                       VPRecipeBase *PredRecipe,
8213                                                       VPlanPtr &Plan) {
8214   // Instructions marked for predication are replicated and placed under an
8215   // if-then construct to prevent side-effects.
8216 
8217   // Generate recipes to compute the block mask for this region.
8218   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8219 
8220   // Build the triangular if-then region.
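  //
  //     pred.<opcode>.entry       (VPBranchOnMaskRecipe)
  //       |             |
  //   pred.<opcode>.if  |         (the replicated PredRecipe)
  //       |             |
  //     pred.<opcode>.continue    (VPPredInstPHIRecipe, if a value is produced)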
8221   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8222   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8223   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8224   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8225   auto *PHIRecipe = Instr->getType()->isVoidTy()
8226                         ? nullptr
8227                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8228   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8229   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8230   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8231 
8232   // Note: first set Entry as region entry and then connect successors starting
8233   // from it in order, to propagate the "parent" of each VPBasicBlock.
8234   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8235   VPBlockUtils::connectBlocks(Pred, Exit);
8236 
8237   return Region;
8238 }
8239 
8240 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8241                                                       VFRange &Range,
8242                                                       VPlanPtr &Plan) {
8243   // First, check for specific widening recipes that deal with calls, memory
8244   // operations, inductions and Phi nodes.
8245   if (auto *CI = dyn_cast<CallInst>(Instr))
8246     return tryToWidenCall(CI, Range, *Plan);
8247 
8248   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8249     return tryToWidenMemory(Instr, Range, Plan);
8250 
8251   VPRecipeBase *Recipe;
8252   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8253     if (Phi->getParent() != OrigLoop->getHeader())
8254       return tryToBlend(Phi, Plan);
8255     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
8256       return Recipe;
8257     return new VPWidenPHIRecipe(Phi);
8258   }
8259 
8260   if (isa<TruncInst>(Instr) &&
8261       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
8262     return Recipe;
8263 
8264   if (!shouldWiden(Instr, Range))
8265     return nullptr;
8266 
8267   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8268     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
8269                                 OrigLoop);
8270 
8271   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8272     bool InvariantCond =
8273         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8274     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
8275                                    InvariantCond);
8276   }
8277 
8278   return tryToWiden(Instr, *Plan);
8279 }
8280 
8281 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8282                                                         ElementCount MaxVF) {
8283   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8284 
8285   // Collect instructions from the original loop that will become trivially dead
8286   // in the vectorized loop. We don't need to vectorize these instructions. For
8287   // example, original induction update instructions can become dead because we
8288   // separately emit induction "steps" when generating code for the new loop.
8289   // Similarly, we create a new latch condition when setting up the structure
8290   // of the new loop, so the old one can become dead.
8291   SmallPtrSet<Instruction *, 4> DeadInstructions;
8292   collectTriviallyDeadInstructions(DeadInstructions);
8293 
8294   // Add assume instructions we need to drop to DeadInstructions, to prevent
8295   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
8297   // control flow is preserved, we should keep them.
8298   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8299   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8300 
8301   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8302   // Dead instructions do not need sinking. Remove them from SinkAfter.
8303   for (Instruction *I : DeadInstructions)
8304     SinkAfter.erase(I);
8305 
8306   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8307   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8308     VFRange SubRange = {VF, MaxVFPlusOne};
8309     VPlans.push_back(
8310         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8311     VF = SubRange.End;
8312   }
8313 }
8314 
8315 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8316     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8317     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8318 
8319   // Hold a mapping from predicated instructions to their recipes, in order to
8320   // fix their AlsoPack behavior if a user is determined to replicate and use a
8321   // scalar instead of vector value.
8322   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
8323 
8324   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8325 
8326   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8327 
8328   // ---------------------------------------------------------------------------
8329   // Pre-construction: record ingredients whose recipes we'll need to further
8330   // process after constructing the initial VPlan.
8331   // ---------------------------------------------------------------------------
8332 
8333   // Mark instructions we'll need to sink later and their targets as
8334   // ingredients whose recipe we'll need to record.
8335   for (auto &Entry : SinkAfter) {
8336     RecipeBuilder.recordRecipeOf(Entry.first);
8337     RecipeBuilder.recordRecipeOf(Entry.second);
8338   }
8339   for (auto &Reduction : CM.getInLoopReductionChains()) {
8340     PHINode *Phi = Reduction.first;
8341     RecurrenceDescriptor::RecurrenceKind Kind =
8342         Legal->getReductionVars()[Phi].getRecurrenceKind();
8343     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8344 
8345     RecipeBuilder.recordRecipeOf(Phi);
8346     for (auto &R : ReductionOperations) {
8347       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
8349       // need to record the ICmp recipe, so it can be removed later.
8350       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8351           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8352         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8353       }
8354     }
8355   }
8356 
8357   // For each interleave group which is relevant for this (possibly trimmed)
8358   // Range, add it to the set of groups to be later applied to the VPlan and add
8359   // placeholders for its members' Recipes which we'll be replacing with a
8360   // single VPInterleaveRecipe.
8361   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8362     auto applyIG = [IG, this](ElementCount VF) -> bool {
8363       return (VF.isVector() && // Query is illegal for VF == 1
8364               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8365                   LoopVectorizationCostModel::CM_Interleave);
8366     };
8367     if (!getDecisionAndClampRange(applyIG, Range))
8368       continue;
8369     InterleaveGroups.insert(IG);
8370     for (unsigned i = 0; i < IG->getFactor(); i++)
8371       if (Instruction *Member = IG->getMember(i))
8372         RecipeBuilder.recordRecipeOf(Member);
  }
8374 
8375   // ---------------------------------------------------------------------------
8376   // Build initial VPlan: Scan the body of the loop in a topological order to
8377   // visit each basic block after having visited its predecessor basic blocks.
8378   // ---------------------------------------------------------------------------
8379 
8380   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8381   auto Plan = std::make_unique<VPlan>();
8382   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8383   Plan->setEntry(VPBB);
8384 
8385   // Scan the body of the loop in a topological order to visit each basic block
8386   // after having visited its predecessor basic blocks.
8387   LoopBlocksDFS DFS(OrigLoop);
8388   DFS.perform(LI);
8389 
8390   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8391     // Relevant instructions from basic block BB will be grouped into VPRecipe
8392     // ingredients and fill a new VPBasicBlock.
8393     unsigned VPBBsForBB = 0;
8394     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8395     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8396     VPBB = FirstVPBBForBB;
8397     Builder.setInsertPoint(VPBB);
8398 
8399     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
8401     for (Instruction &I : BB->instructionsWithoutDebug()) {
8402       Instruction *Instr = &I;
8403 
8404       // First filter out irrelevant instructions, to ensure no recipes are
8405       // built for them.
8406       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8407         continue;
8408 
8409       if (auto Recipe =
8410               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8411         // Check if the recipe can be converted to a VPValue. We need the extra
8412         // down-casting step until VPRecipeBase inherits from VPValue.
8413         VPValue *MaybeVPValue = Recipe->toVPValue();
8414         if (!Instr->getType()->isVoidTy() && MaybeVPValue)
8415           Plan->addVPValue(Instr, MaybeVPValue);
8416 
8417         RecipeBuilder.setRecipe(Instr, Recipe);
8418         VPBB->appendRecipe(Recipe);
8419         continue;
8420       }
8421 
      // Otherwise, if all widening options failed, the instruction is to be
8423       // replicated. This may create a successor for VPBB.
8424       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
8425           Instr, Range, VPBB, PredInst2Recipe, Plan);
8426       if (NextVPBB != VPBB) {
8427         VPBB = NextVPBB;
8428         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8429                                     : "");
8430       }
8431     }
8432   }
8433 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
8437   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8438   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8439   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8440   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8441   delete PreEntry;
8442 
8443   // ---------------------------------------------------------------------------
8444   // Transform initial VPlan: Apply previously taken decisions, in order, to
8445   // bring the VPlan to its final state.
8446   // ---------------------------------------------------------------------------
8447 
8448   // Apply Sink-After legal constraints.
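  // For example (illustrative): given a first-order recurrence
  //   %x = phi [%init, %ph], [%pre, %latch] ... %use = add %x, 1 ... %pre = load
  // Legality records (%use -> %pre) in SinkAfter, so the recipe of %use is
  // moved to follow the recipe of %pre, allowing the recurrence to be formed
  // from the previous and current vector values later on.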
8449   for (auto &Entry : SinkAfter) {
8450     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8451     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8452     Sink->moveAfter(Target);
8453   }
8454 
8455   // Interleave memory: for each Interleave Group we marked earlier as relevant
8456   // for this VPlan, replace the Recipes widening its memory instructions with a
8457   // single VPInterleaveRecipe at its insertion point.
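  // Net effect (sketched): each member's widened memory recipe disappears and
  // the group is represented by one VPInterleaveRecipe, whose execution emits
  // a single wide load/store plus shufflevectors that (de)interleave the
  // members' lanes.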
8458   for (auto IG : InterleaveGroups) {
8459     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8460         RecipeBuilder.getRecipe(IG->getInsertPos()));
8461     SmallVector<VPValue *, 4> StoredValues;
8462     for (unsigned i = 0; i < IG->getFactor(); ++i)
8463       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
8464         StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
8465 
8466     (new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8467                             Recipe->getMask()))
8468         ->insertBefore(Recipe);
8469 
8470     for (unsigned i = 0; i < IG->getFactor(); ++i)
8471       if (Instruction *Member = IG->getMember(i)) {
8472         if (!Member->getType()->isVoidTy()) {
8473           VPValue *OriginalV = Plan->getVPValue(Member);
8474           Plan->removeVPValueFor(Member);
8475           OriginalV->replaceAllUsesWith(Plan->getOrAddVPValue(Member));
8476         }
8477         RecipeBuilder.getRecipe(Member)->eraseFromParent();
8478       }
8479   }
8480 
  // Adjust the recipes for any in-loop reductions.
8482   if (Range.Start.isVector())
8483     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
8484 
  // Finally, if the tail is folded by masking, introduce selects between the
  // phi and the live-out instruction of each reduction, at the end of the
  // latch.
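  // Sketch of the result: for a reduction with header phi %rdx and loop-exit
  // value %rdx.next, a select(block-in-mask, %rdx.next, %rdx) is appended to
  // the last VPBasicBlock, so lanes disabled by tail folding carry the
  // previous value of the reduction forward.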
8487   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
8488     Builder.setInsertPoint(VPBB);
8489     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
8490     for (auto &Reduction : Legal->getReductionVars()) {
8491       if (CM.isInLoopReduction(Reduction.first))
8492         continue;
8493       VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
8494       VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
8495       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
8496     }
8497   }
8498 
8499   std::string PlanName;
8500   raw_string_ostream RSO(PlanName);
8501   ElementCount VF = Range.Start;
8502   Plan->addVF(VF);
8503   RSO << "Initial VPlan for VF={" << VF;
8504   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
8505     Plan->addVF(VF);
8506     RSO << "," << VF;
8507   }
8508   RSO << "},UF>=1";
8509   RSO.flush();
8510   Plan->setName(PlanName);
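  // E.g., a plan covering VFs 4 and 8 is named
  // "Initial VPlan for VF={4,8},UF>=1".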
8511 
8512   return Plan;
8513 }
8514 
8515 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
8520   assert(!OrigLoop->isInnermost());
8521   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8522 
8523   // Create new empty VPlan
8524   auto Plan = std::make_unique<VPlan>();
8525 
8526   // Build hierarchical CFG
8527   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8528   HCFGBuilder.buildHierarchicalCFG();
8529 
8530   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8531        VF *= 2)
8532     Plan->addVF(VF);
8533 
8534   if (EnableVPlanPredication) {
8535     VPlanPredicator VPP(*Plan);
8536     VPP.predicate();
8537 
    // Avoid running the transformation to recipes until masked code generation
    // in the VPlan-native path is in place.
8540     return Plan;
8541   }
8542 
8543   SmallPtrSet<Instruction *, 1> DeadInstructions;
8544   VPlanTransforms::VPInstructionsToVPRecipes(
8545       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
8546   return Plan;
8547 }
8548 
// Adjust the recipes for any in-loop reductions. The chain of instructions
// leading from the loop exit instruction to the phi needs to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
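// For example (illustrative): for an integer add reduction
//   %phi = phi i32 ...; %mul = mul i32 %a, %b; %sum = add i32 %phi, %mul
// the chain contains only %sum; its VPWidenRecipe is replaced below by a
// VPReductionRecipe whose chain operand is %phi and whose vector operand is
// %mul.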
8553 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8554     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8555   for (auto &Reduction : CM.getInLoopReductionChains()) {
8556     PHINode *Phi = Reduction.first;
8557     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8558     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8559 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For min/max reductions the chain is the select instructions.
8564     Instruction *Chain = Phi;
8565     for (Instruction *R : ReductionOperations) {
8566       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8567       RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
8568 
8569       VPValue *ChainOp = Plan->getVPValue(Chain);
8570       unsigned FirstOpId;
8571       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8572           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8573         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8574                "Expected to replace a VPWidenSelectSC");
8575         FirstOpId = 1;
8576       } else {
8577         assert(isa<VPWidenRecipe>(WidenRecipe) &&
8578                "Expected to replace a VPWidenSC");
8579         FirstOpId = 0;
8580       }
8581       unsigned VecOpId =
8582           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
8583       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
8584 
8585       auto *CondOp = CM.foldTailByMasking()
8586                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
8587                          : nullptr;
8588       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
8589           &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
8590       WidenRecipe->toVPValue()->replaceAllUsesWith(RedRecipe);
8591       Plan->removeVPValueFor(R);
8592       Plan->addVPValue(R, RedRecipe);
8593       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
8594       WidenRecipe->eraseFromParent();
8595 
8596       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8597           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8598         VPRecipeBase *CompareRecipe =
8599             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
8600         assert(isa<VPWidenRecipe>(CompareRecipe) &&
8601                "Expected to replace a VPWidenSC");
8602         assert(CompareRecipe->toVPValue()->getNumUsers() == 0 &&
8603                "Expected no remaining users");
8604         CompareRecipe->eraseFromParent();
8605       }
8606       Chain = R;
8607     }
8608   }
8609 }
8610 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
8615 
8616 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
8617     Value *V, const VPIteration &Instance) {
8618   return ILV.getOrCreateScalarValue(V, Instance);
8619 }
8620 
8621 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
8622                                VPSlotTracker &SlotTracker) const {
8623   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
8624   IG->getInsertPos()->printAsOperand(O, false);
8625   O << ", ";
8626   getAddr()->printAsOperand(O, SlotTracker);
8627   VPValue *Mask = getMask();
8628   if (Mask) {
8629     O << ", ";
8630     Mask->printAsOperand(O, SlotTracker);
8631   }
8632   for (unsigned i = 0; i < IG->getFactor(); ++i)
8633     if (Instruction *I = IG->getMember(i))
8634       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
8635 }
8636 
8637 void VPWidenCallRecipe::execute(VPTransformState &State) {
8638   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
8639                                   *this, State);
8640 }
8641 
8642 void VPWidenSelectRecipe::execute(VPTransformState &State) {
8643   State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
8644                                     this, *this, InvariantCond, State);
8645 }
8646 
8647 void VPWidenRecipe::execute(VPTransformState &State) {
8648   State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
8649 }
8650 
8651 void VPWidenGEPRecipe::execute(VPTransformState &State) {
8652   State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
8653                       *this, State.UF, State.VF, IsPtrLoopInvariant,
8654                       IsIndexLoopInvariant, State);
8655 }
8656 
8657 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
8658   assert(!State.Instance && "Int or FP induction being replicated.");
8659   State.ILV->widenIntOrFpInduction(IV, Trunc);
8660 }
8661 
8662 void VPWidenPHIRecipe::execute(VPTransformState &State) {
8663   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
8664 }
8665 
8666 void VPBlendRecipe::execute(VPTransformState &State) {
8667   State.ILV->setDebugLocFromInst(State.Builder, Phi);
8668   // We know that all PHIs in non-header blocks are converted into
8669   // selects, so we don't have to worry about the insertion order and we
8670   // can just use the builder.
8671   // At this point we generate the predication tree. There may be
8672   // duplications since this is a simple recursive scan, but future
8673   // optimizations will clean it up.
8674 
8675   unsigned NumIncoming = getNumIncomingValues();
8676 
8677   // Generate a sequence of selects of the form:
8678   // SELECT(Mask3, In3,
8679   //        SELECT(Mask2, In2,
8680   //               SELECT(Mask1, In1,
8681   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are essentially undef, take their value from In0.
8684   InnerLoopVectorizer::VectorParts Entry(State.UF);
8685   for (unsigned In = 0; In < NumIncoming; ++In) {
8686     for (unsigned Part = 0; Part < State.UF; ++Part) {
8687       // We might have single edge PHIs (blocks) - use an identity
8688       // 'select' for the first PHI operand.
8689       Value *In0 = State.get(getIncomingValue(In), Part);
8690       if (In == 0)
8691         Entry[Part] = In0; // Initialize with the first incoming value.
8692       else {
8693         // Select between the current value and the previous incoming edge
8694         // based on the incoming mask.
8695         Value *Cond = State.get(getMask(In), Part);
8696         Entry[Part] =
8697             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8698       }
8699     }
8700   }
8701   for (unsigned Part = 0; Part < State.UF; ++Part)
8702     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
8703 }
8704 
8705 void VPInterleaveRecipe::execute(VPTransformState &State) {
8706   assert(!State.Instance && "Interleave group being replicated.");
8707   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getStoredValues(),
8708                                       getMask());
8709 }
8710 
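// Per unrolled part this generates, roughly: an optional select of the vector
// operand against the reduction identity when a condition is present, a target
// reduction of that vector to a scalar, and a scalar binary op (or min/max)
// folding the result into the chain value of the previous part.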
8711 void VPReductionRecipe::execute(VPTransformState &State) {
8712   assert(!State.Instance && "Reduction being replicated.");
8713   for (unsigned Part = 0; Part < State.UF; ++Part) {
8714     RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc->getRecurrenceKind();
8715     Value *NewVecOp = State.get(getVecOp(), Part);
8716     if (VPValue *Cond = getCondOp()) {
8717       Value *NewCond = State.get(Cond, Part);
8718       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
8719       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
8720           Kind, RdxDesc->getMinMaxRecurrenceKind(), VecTy->getElementType());
8721       Constant *IdenVec =
8722           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
8723       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
8724       NewVecOp = Select;
8725     }
8726     Value *NewRed =
8727         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
8728     Value *PrevInChain = State.get(getChainOp(), Part);
8729     Value *NextInChain;
8730     if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8731         Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8732       NextInChain =
8733           createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
8734                          NewRed, PrevInChain);
8735     } else {
8736       NextInChain = State.Builder.CreateBinOp(
8737           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
8738           PrevInChain);
8739     }
8740     State.set(this, getUnderlyingInstr(), NextInChain, Part);
8741   }
8742 }
8743 
8744 void VPReplicateRecipe::execute(VPTransformState &State) {
8745   if (State.Instance) { // Generate a single instance.
8746     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
8747     State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
8748                                     *State.Instance, IsPredicated, State);
    // Insert the scalar instance, packing it into a vector.
8750     if (AlsoPack && State.VF.isVector()) {
8751       // If we're constructing lane 0, initialize to start from undef.
8752       if (State.Instance->Lane == 0) {
8753         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8754         Value *Undef = UndefValue::get(
8755             VectorType::get(getUnderlyingValue()->getType(), State.VF));
8756         State.ValueMap.setVectorValue(getUnderlyingInstr(),
8757                                       State.Instance->Part, Undef);
8758       }
8759       State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
8760                                            *State.Instance);
8761     }
8762     return;
8763   }
8764 
8765   // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for each
8767   // of the UF parts.
8768   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8769   assert((!State.VF.isScalable() || IsUniform) &&
8770          "Can't scalarize a scalable vector");
8771   for (unsigned Part = 0; Part < State.UF; ++Part)
8772     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8773       State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
8774                                       IsPredicated, State);
8775 }
8776 
8777 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8778   assert(State.Instance && "Branch on Mask works only on single instance.");
8779 
8780   unsigned Part = State.Instance->Part;
8781   unsigned Lane = State.Instance->Lane;
8782 
8783   Value *ConditionBit = nullptr;
8784   VPValue *BlockInMask = getMask();
8785   if (BlockInMask) {
8786     ConditionBit = State.get(BlockInMask, Part);
8787     if (ConditionBit->getType()->isVectorTy())
8788       ConditionBit = State.Builder.CreateExtractElement(
8789           ConditionBit, State.Builder.getInt32(Lane));
8790   } else // Block in mask is all-one.
8791     ConditionBit = State.Builder.getTrue();
8792 
8793   // Replace the temporary unreachable terminator with a new conditional branch,
8794   // whose two destinations will be set later when they are created.
8795   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8796   assert(isa<UnreachableInst>(CurrentTerminator) &&
8797          "Expected to replace unreachable terminator with conditional branch.");
8798   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8799   CondBr->setSuccessor(0, nullptr);
8800   ReplaceInstWithInst(CurrentTerminator, CondBr);
8801 }
8802 
8803 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8804   assert(State.Instance && "Predicated instruction PHI works per instance.");
8805   Instruction *ScalarPredInst =
8806       cast<Instruction>(State.get(getOperand(0), *State.Instance));
8807   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8808   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8809   assert(PredicatingBB && "Predicated block has no single predecessor.");
8810 
8811   // By current pack/unpack logic we need to generate only a single phi node: if
8812   // a vector value for the predicated instruction exists at this point it means
8813   // the instruction has vector users only, and a phi for the vector value is
8814   // needed. In this case the recipe of the predicated instruction is marked to
8815   // also do that packing, thereby "hoisting" the insert-element sequence.
8816   // Otherwise, a phi node for the scalar value is needed.
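  // For example (sketched): if the predicated block packed the scalar into
  //   %v = insertelement <4 x i32> %vec, i32 %s, i32 %lane
  // the phi merges %vec (incoming from the predicating block) with %v
  // (incoming from the predicated block); otherwise a scalar phi of undef and
  // %s is created instead.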
8817   unsigned Part = State.Instance->Part;
8818   Instruction *PredInst =
8819       cast<Instruction>(getOperand(0)->getUnderlyingValue());
8820   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8821     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8822     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8823     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8824     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8825     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8826     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8827   } else {
8828     Type *PredInstType = PredInst->getType();
8829     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8830     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8831     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8832     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8833   }
8834 }
8835 
8836 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8837   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
8838   State.ILV->vectorizeMemoryInstruction(&Ingredient, State,
8839                                         StoredValue ? nullptr : toVPValue(),
8840                                         getAddr(), StoredValue, getMask());
8841 }
8842 
// Determine how to lower the scalar epilogue, which depends on 1) optimizing
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyzes whether the loop is suitable
// for predication.
8847 static ScalarEpilogueLowering getScalarEpilogueLowering(
8848     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8849     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8850     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8851     LoopVectorizationLegality &LVL) {
8852   // 1) OptSize takes precedence over all other options, i.e. if this is set,
8853   // don't look at hints or options, and don't request a scalar epilogue.
8854   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8855   // LoopAccessInfo (due to code dependency and not being able to reliably get
8856   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8857   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
8858   // versioning when the vectorization is forced, unlike hasOptSize. So revert
8859   // back to the old way and vectorize with versioning when forced. See D81345.)
8860   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8861                                                       PGSOQueryType::IRPass) &&
8862                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8863     return CM_ScalarEpilogueNotAllowedOptSize;
8864 
8865   // 2) If set, obey the directives
8866   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
8867     switch (PreferPredicateOverEpilogue) {
8868     case PreferPredicateTy::ScalarEpilogue:
8869       return CM_ScalarEpilogueAllowed;
8870     case PreferPredicateTy::PredicateElseScalarEpilogue:
8871       return CM_ScalarEpilogueNotNeededUsePredicate;
8872     case PreferPredicateTy::PredicateOrDontVectorize:
8873       return CM_ScalarEpilogueNotAllowedUsePredicate;
8874     };
8875   }
8876 
8877   // 3) If set, obey the hints
8878   switch (Hints.getPredicate()) {
8879   case LoopVectorizeHints::FK_Enabled:
8880     return CM_ScalarEpilogueNotNeededUsePredicate;
8881   case LoopVectorizeHints::FK_Disabled:
8882     return CM_ScalarEpilogueAllowed;
8883   };
8884 
  // 4) If the TTI hook indicates this is profitable, request predication.
8886   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8887                                        LVL.getLAI()))
8888     return CM_ScalarEpilogueNotNeededUsePredicate;
8889 
8890   return CM_ScalarEpilogueAllowed;
8891 }
8892 
8893 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
8894                            unsigned Part) {
8895   set(Def, V, Part);
8896   ILV->setVectorValue(IRDef, Part, V);
8897 }
8898 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
8903 static bool processLoopInVPlanNativePath(
8904     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8905     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8906     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8907     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8908     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8909 
8910   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
8911     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8912     return false;
8913   }
8914   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8915   Function *F = L->getHeader()->getParent();
8916   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8917 
8918   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8919       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8920 
8921   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8922                                 &Hints, IAI);
8923   // Use the planner for outer loop vectorization.
8924   // TODO: CM is not used at this point inside the planner. Turn CM into an
8925   // optional argument if we don't need it in the future.
8926   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8927 
8928   // Get user vectorization factor.
8929   ElementCount UserVF = Hints.getWidth();
8930 
8931   // Plan how to best vectorize, return the best VF and its cost.
8932   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
8933 
8934   // If we are stress testing VPlan builds, do not attempt to generate vector
8935   // code. Masked vector code generation support will follow soon.
8936   // Also, do not attempt to vectorize if no vector code will be produced.
8937   if (VPlanBuildStressTest || EnableVPlanPredication ||
8938       VectorizationFactor::Disabled() == VF)
8939     return false;
8940 
8941   LVP.setBestPlan(VF.Width, 1);
8942 
8943   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
8944                          &CM, BFI, PSI);
8945   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8946                     << L->getHeader()->getParent()->getName() << "\"\n");
8947   LVP.executePlan(LB, DT);
8948 
8949   // Mark the loop as already vectorized to avoid vectorizing again.
8950   Hints.setAlreadyVectorized();
8951 
8952   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8953   return true;
8954 }
8955 
8956 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8957     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8958                                !EnableLoopInterleaving),
8959       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8960                               !EnableLoopVectorization) {}
8961 
8962 bool LoopVectorizePass::processLoop(Loop *L) {
8963   assert((EnableVPlanNativePath || L->isInnermost()) &&
8964          "VPlan-native path is not enabled. Only process inner loops.");
8965 
8966 #ifndef NDEBUG
8967   const std::string DebugLocStr = getDebugLocString(L);
8968 #endif /* NDEBUG */
8969 
8970   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
8971                     << L->getHeader()->getParent()->getName() << "\" from "
8972                     << DebugLocStr << "\n");
8973 
8974   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
8975 
8976   LLVM_DEBUG(
8977       dbgs() << "LV: Loop hints:"
8978              << " force="
8979              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8980                      ? "disabled"
8981                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8982                             ? "enabled"
8983                             : "?"))
8984              << " width=" << Hints.getWidth()
8985              << " unroll=" << Hints.getInterleave() << "\n");
8986 
8987   // Function containing loop
8988   Function *F = L->getHeader()->getParent();
8989 
  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose and report vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
8997 
8998   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
8999     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9000     return false;
9001   }
9002 
9003   PredicatedScalarEvolution PSE(*SE, *L);
9004 
9005   // Check if it is legal to vectorize the loop.
9006   LoopVectorizationRequirements Requirements(*ORE);
9007   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
9008                                 &Requirements, &Hints, DB, AC, BFI, PSI);
9009   if (!LVL.canVectorize(EnableVPlanNativePath)) {
9010     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9011     Hints.emitRemarkWithHints();
9012     return false;
9013   }
9014 
9015   // Check the function attributes and profiles to find out if this function
9016   // should be optimized for size.
9017   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9018       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
9019 
9020   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9021   // here. They may require CFG and instruction level transformations before
9022   // even evaluating whether vectorization is profitable. Since we cannot modify
9023   // the incoming IR, we need to build VPlan upfront in the vectorization
9024   // pipeline.
9025   if (!L->isInnermost())
9026     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9027                                         ORE, BFI, PSI, Hints);
9028 
9029   assert(L->isInnermost() && "Inner loop expected.");
9030 
9031   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9032   // count by optimizing for size, to minimize overheads.
9033   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9034   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9035     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9036                       << "This loop is worth vectorizing only if no scalar "
9037                       << "iteration overheads are incurred.");
9038     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9039       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9040     else {
9041       LLVM_DEBUG(dbgs() << "\n");
9042       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9043     }
9044   }
9045 
9046   // Check the function attributes to see if implicit floats are allowed.
9047   // FIXME: This check doesn't seem possibly correct -- what if the loop is
9048   // an integer loop and the vector instructions selected are purely integer
9049   // vector instructions?
9050   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9051     reportVectorizationFailure(
9052         "Can't vectorize when the NoImplicitFloat attribute is used",
9053         "loop not vectorized due to NoImplicitFloat attribute",
9054         "NoImplicitFloat", ORE, L);
9055     Hints.emitRemarkWithHints();
9056     return false;
9057   }
9058 
9059   // Check if the target supports potentially unsafe FP vectorization.
9060   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9061   // for the target we're vectorizing for, to make sure none of the
9062   // additional fp-math flags can help.
9063   if (Hints.isPotentiallyUnsafe() &&
9064       TTI->isFPVectorizationPotentiallyUnsafe()) {
9065     reportVectorizationFailure(
9066         "Potentially unsafe FP op prevents vectorization",
9067         "loop not vectorized due to unsafe FP support.",
9068         "UnsafeFP", ORE, L);
9069     Hints.emitRemarkWithHints();
9070     return false;
9071   }
9072 
9073   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9074   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9075 
9076   // If an override option has been passed in for interleaved accesses, use it.
9077   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9078     UseInterleaved = EnableInterleavedMemAccesses;
9079 
9080   // Analyze interleaved memory accesses.
9081   if (UseInterleaved) {
9082     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9083   }
9084 
9085   // Use the cost model.
9086   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9087                                 F, &Hints, IAI);
9088   CM.collectValuesToIgnore();
9089 
9090   // Use the planner for vectorization.
9091   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
9092 
9093   // Get user vectorization factor and interleave count.
9094   ElementCount UserVF = Hints.getWidth();
9095   unsigned UserIC = Hints.getInterleave();
9096 
9097   // Plan how to best vectorize, return the best VF and its cost.
9098   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9099 
9100   VectorizationFactor VF = VectorizationFactor::Disabled();
9101   unsigned IC = 1;
9102 
9103   if (MaybeVF) {
9104     VF = *MaybeVF;
9105     // Select the interleave count.
9106     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9107   }
9108 
9109   // Identify the diagnostic messages that should be produced.
9110   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9111   bool VectorizeLoop = true, InterleaveLoop = true;
9112   if (Requirements.doesNotMeet(F, L, Hints)) {
9113     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
9114                          "requirements.\n");
9115     Hints.emitRemarkWithHints();
9116     return false;
9117   }
9118 
9119   if (VF.Width.isScalar()) {
9120     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9121     VecDiagMsg = std::make_pair(
9122         "VectorizationNotBeneficial",
9123         "the cost-model indicates that vectorization is not beneficial");
9124     VectorizeLoop = false;
9125   }
9126 
9127   if (!MaybeVF && UserIC > 1) {
9128     // Tell the user interleaving was avoided up-front, despite being explicitly
9129     // requested.
9130     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9131                          "interleaving should be avoided up front\n");
9132     IntDiagMsg = std::make_pair(
9133         "InterleavingAvoided",
9134         "Ignoring UserIC, because interleaving was avoided up front");
9135     InterleaveLoop = false;
9136   } else if (IC == 1 && UserIC <= 1) {
9137     // Tell the user interleaving is not beneficial.
9138     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9139     IntDiagMsg = std::make_pair(
9140         "InterleavingNotBeneficial",
9141         "the cost-model indicates that interleaving is not beneficial");
9142     InterleaveLoop = false;
9143     if (UserIC == 1) {
9144       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9145       IntDiagMsg.second +=
9146           " and is explicitly disabled or interleave count is set to 1";
9147     }
9148   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial but is explicitly disabled.
9150     LLVM_DEBUG(
9151         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9152     IntDiagMsg = std::make_pair(
9153         "InterleavingBeneficialButDisabled",
9154         "the cost-model indicates that interleaving is beneficial "
9155         "but is explicitly disabled or interleave count is set to 1");
9156     InterleaveLoop = false;
9157   }
9158 
9159   // Override IC if user provided an interleave count.
9160   IC = UserIC > 0 ? UserIC : IC;
9161 
9162   // Emit diagnostic messages, if any.
9163   const char *VAPassName = Hints.vectorizeAnalysisPassName();
9164   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
9166     ORE->emit([&]() {
9167       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9168                                       L->getStartLoc(), L->getHeader())
9169              << VecDiagMsg.second;
9170     });
9171     ORE->emit([&]() {
9172       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9173                                       L->getStartLoc(), L->getHeader())
9174              << IntDiagMsg.second;
9175     });
9176     return false;
9177   } else if (!VectorizeLoop && InterleaveLoop) {
9178     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9179     ORE->emit([&]() {
9180       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9181                                         L->getStartLoc(), L->getHeader())
9182              << VecDiagMsg.second;
9183     });
9184   } else if (VectorizeLoop && !InterleaveLoop) {
9185     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9186                       << ") in " << DebugLocStr << '\n');
9187     ORE->emit([&]() {
9188       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9189                                         L->getStartLoc(), L->getHeader())
9190              << IntDiagMsg.second;
9191     });
9192   } else if (VectorizeLoop && InterleaveLoop) {
9193     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9194                       << ") in " << DebugLocStr << '\n');
9195     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9196   }
9197 
9198   LVP.setBestPlan(VF.Width, IC);
9199 
9200   using namespace ore;
9201   bool DisableRuntimeUnroll = false;
9202   MDNode *OrigLoopID = L->getLoopID();
9203 
9204   if (!VectorizeLoop) {
9205     assert(IC > 1 && "interleave count should not be 1 or 0");
9206     // If we decided that it is not legal to vectorize the loop, then
9207     // interleave it.
9208     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
9209                                BFI, PSI);
9210     LVP.executePlan(Unroller, DT);
9211 
9212     ORE->emit([&]() {
9213       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9214                                 L->getHeader())
9215              << "interleaved loop (interleaved count: "
9216              << NV("InterleaveCount", IC) << ")";
9217     });
9218   } else {
9219     // If we decided that it is *legal* to vectorize the loop, then do it.
9220 
9221     // Consider vectorizing the epilogue too if it's profitable.
9222     VectorizationFactor EpilogueVF =
9223       CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
9224     if (EpilogueVF.Width.isVector()) {
9225 
9226       // The first pass vectorizes the main loop and creates a scalar epilogue
9227       // to be vectorized by executing the plan (potentially with a different
9228       // factor) again shortly afterwards.
9229       EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
9230                                         EpilogueVF.Width.getKnownMinValue(), 1);
9231       EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI,
9232                                          &LVL, &CM, BFI, PSI);
9233 
9234       LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
9235       LVP.executePlan(MainILV, DT);
9236       ++LoopsVectorized;
9237 
9238       simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9239       formLCSSARecursively(*L, *DT, LI, SE);
9240 
9241       // Second pass vectorizes the epilogue and adjusts the control flow
9242       // edges from the first pass.
9243       LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
9244       EPI.MainLoopVF = EPI.EpilogueVF;
9245       EPI.MainLoopUF = EPI.EpilogueUF;
9246       EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
9247                                                ORE, EPI, &LVL, &CM, BFI, PSI);
9248       LVP.executePlan(EpilogILV, DT);
9249       ++LoopsEpilogueVectorized;
9250 
9251       if (!MainILV.areSafetyChecksAdded())
9252         DisableRuntimeUnroll = true;
9253     } else {
9254       InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
9255                              &LVL, &CM, BFI, PSI);
9256       LVP.executePlan(LB, DT);
9257       ++LoopsVectorized;
9258 
      // Add metadata to disable runtime unrolling of the scalar loop when
      // there are no runtime checks about strides and memory. A scalar loop
      // that is rarely used is not worth unrolling.
9262       if (!LB.areSafetyChecksAdded())
9263         DisableRuntimeUnroll = true;
9264     }
9265 
9266     // Report the vectorization decision.
9267     ORE->emit([&]() {
9268       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
9269                                 L->getHeader())
9270              << "vectorized loop (vectorization width: "
9271              << NV("VectorizationFactor", VF.Width)
9272              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
9273     });
9274   }
9275 
9276   Optional<MDNode *> RemainderLoopID =
9277       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
9278                                       LLVMLoopVectorizeFollowupEpilogue});
9279   if (RemainderLoopID.hasValue()) {
9280     L->setLoopID(RemainderLoopID.getValue());
9281   } else {
9282     if (DisableRuntimeUnroll)
9283       AddRuntimeUnrollDisableMetaData(L);
9284 
9285     // Mark the loop as already vectorized to avoid vectorizing again.
9286     Hints.setAlreadyVectorized();
9287   }
9288 
9289   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9290   return true;
9291 }
9292 
9293 LoopVectorizeResult LoopVectorizePass::runImpl(
9294     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
9295     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
9296     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
9297     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
9298     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
9299   SE = &SE_;
9300   LI = &LI_;
9301   TTI = &TTI_;
9302   DT = &DT_;
9303   BFI = &BFI_;
9304   TLI = TLI_;
9305   AA = &AA_;
9306   AC = &AC_;
9307   GetLAA = &GetLAA_;
9308   DB = &DB_;
9309   ORE = &ORE_;
9310   PSI = PSI_;
9311 
9312   // Don't attempt if
9313   // 1. the target claims to have no vector registers, and
9314   // 2. interleaving won't help ILP.
9315   //
9316   // The second condition is necessary because, even if the target has no
9317   // vector registers, loop vectorization may still enable scalar
9318   // interleaving.
9319   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
9320       TTI->getMaxInterleaveFactor(1) < 2)
9321     return LoopVectorizeResult(false, false);
9322 
9323   bool Changed = false, CFGChanged = false;
9324 
9325   // The vectorizer requires loops to be in simplified form.
9326   // Since simplification may add new inner loops, it has to run before the
9327   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
9329   // vectorized.
9330   for (auto &L : *LI)
9331     Changed |= CFGChanged |=
9332         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9333 
9334   // Build up a worklist of inner-loops to vectorize. This is necessary as
9335   // the act of vectorizing or partially unrolling a loop creates new loops
9336   // and can invalidate iterators across the loops.
9337   SmallVector<Loop *, 8> Worklist;
9338 
9339   for (Loop *L : *LI)
9340     collectSupportedLoops(*L, LI, ORE, Worklist);
9341 
9342   LoopsAnalyzed += Worklist.size();
9343 
9344   // Now walk the identified inner loops.
9345   while (!Worklist.empty()) {
9346     Loop *L = Worklist.pop_back_val();
9347 
9348     // For the inner loops we actually process, form LCSSA to simplify the
9349     // transform.
9350     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
9351 
9352     Changed |= CFGChanged |= processLoop(L);
9353   }
9354 
9355   // Process each loop nest in the function.
9356   return LoopVectorizeResult(Changed, CFGChanged);
9357 }
9358 
9359 PreservedAnalyses LoopVectorizePass::run(Function &F,
9360                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
9404 }
9405