1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
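// For example (a conceptual sketch, not code from this pass), with a vector
// width of 4, a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + C[i];
//
// is executed as
//
//   for (int i = 0; i + 3 < n; i += 4)
//     A[i..i+3] = B[i..i+3] + C[i..i+3];   // one 'wide' SIMD iteration
//
// with the remaining n % 4 iterations handled by a scalar epilogue loop.
//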
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is an ongoing development effort to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD.
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 #ifndef NDEBUG
161 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
162 #endif
163 
164 /// @{
165 /// Metadata attribute names
166 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
167 const char LLVMLoopVectorizeFollowupVectorized[] =
168     "llvm.loop.vectorize.followup_vectorized";
169 const char LLVMLoopVectorizeFollowupEpilogue[] =
170     "llvm.loop.vectorize.followup_epilogue";
171 /// @}
172 
173 STATISTIC(LoopsVectorized, "Number of loops vectorized");
174 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
175 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
176 
177 static cl::opt<bool> EnableEpilogueVectorization(
178     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
179     cl::desc("Enable vectorization of epilogue loops."));
180 
181 static cl::opt<unsigned> EpilogueVectorizationForceVF(
182     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
183     cl::desc("When epilogue vectorization is enabled, and a value greater than "
184              "1 is specified, forces the given VF for all applicable epilogue "
185              "loops."));
186 
187 static cl::opt<unsigned> EpilogueVectorizationMinVF(
188     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189     cl::desc("Only loops with vectorization factor equal to or larger than "
190              "the specified value are considered for epilogue vectorization."));
191 
192 /// Loops with a known constant trip count below this number are vectorized only
193 /// if no scalar iteration overheads are incurred.
194 static cl::opt<unsigned> TinyTripCountVectorThreshold(
195     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
196     cl::desc("Loops with a constant trip count that is smaller than this "
197              "value are vectorized only if no scalar iteration overheads "
198              "are incurred."));
199 
200 // The option prefer-predicate-over-epilogue indicates that an epilogue is
201 // undesired and that predication is preferred; the enum below lists the
202 // supported choices. I.e., the vectorizer will try to fold the tail loop
203 // (epilogue) into the vector body and predicate the instructions accordingly.
204 // If tail-folding fails, the fallback strategy is selected by these values.
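//
// As a rough illustration (not code from this pass), instead of emitting
//
//   for (i = 0; i + (VF - 1) < n; i += VF)   // vector body
//     ...
//   for (; i < n; ++i)                       // scalar epilogue
//     ...
//
// tail folding emits a single predicated vector body in which each lane is
// masked with the condition (i + lane < n), so no epilogue loop is needed.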
205 namespace PreferPredicateTy {
206   enum Option {
207     ScalarEpilogue = 0,
208     PredicateElseScalarEpilogue,
209     PredicateOrDontVectorize
210   };
211 } // namespace PreferPredicateTy
212 
213 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
214     "prefer-predicate-over-epilogue",
215     cl::init(PreferPredicateTy::ScalarEpilogue),
216     cl::Hidden,
217     cl::desc("Tail-folding and predication preferences over creating a scalar "
218              "epilogue loop."),
219     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
220                          "scalar-epilogue",
221                          "Don't tail-predicate loops, create scalar epilogue"),
222               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
223                          "predicate-else-scalar-epilogue",
224                          "Prefer tail-folding, create scalar epilogue if tail-"
225                          "folding fails."),
226               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
227                          "predicate-dont-vectorize",
228                          "Prefer tail-folding, don't attempt vectorization if "
229                          "tail-folding fails.")));
230 
231 static cl::opt<bool> MaximizeBandwidth(
232     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
233     cl::desc("Maximize bandwidth when selecting vectorization factor which "
234              "will be determined by the smallest type in loop."));
235 
236 static cl::opt<bool> EnableInterleavedMemAccesses(
237     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
238     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
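// For example (illustrative only), the two strided accesses in
//
//   for (i = 0; i < n; ++i) {
//     sum0 += A[2 * i];
//     sum1 += A[2 * i + 1];
//   }
//
// form an interleave group with factor 2 that can be vectorized as a single
// wide load of 2 * VF consecutive elements followed by shuffles that separate
// the even and odd elements.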
239 
240 /// An interleave-group may need masking if it resides in a block that needs
241 /// predication, or in order to mask away gaps.
242 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
243     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
245 
246 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
247     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
248     cl::desc("We don't interleave loops with an estimated constant trip count "
249              "below this number"));
250 
251 static cl::opt<unsigned> ForceTargetNumScalarRegs(
252     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
253     cl::desc("A flag that overrides the target's number of scalar registers."));
254 
255 static cl::opt<unsigned> ForceTargetNumVectorRegs(
256     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
257     cl::desc("A flag that overrides the target's number of vector registers."));
258 
259 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
260     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
261     cl::desc("A flag that overrides the target's max interleave factor for "
262              "scalar loops."));
263 
264 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
265     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
266     cl::desc("A flag that overrides the target's max interleave factor for "
267              "vectorized loops."));
268 
269 static cl::opt<unsigned> ForceTargetInstructionCost(
270     "force-target-instruction-cost", cl::init(0), cl::Hidden,
271     cl::desc("A flag that overrides the target's expected cost for "
272              "an instruction to a single constant value. Mostly "
273              "useful for getting consistent testing."));
274 
275 static cl::opt<unsigned> SmallLoopCost(
276     "small-loop-cost", cl::init(20), cl::Hidden,
277     cl::desc(
278         "The cost of a loop that is considered 'small' by the interleaver."));
279 
280 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
281     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
282     cl::desc("Enable the use of the block frequency analysis to access PGO "
283              "heuristics minimizing code growth in cold regions and being more "
284              "aggressive in hot regions."));
285 
286 // Runtime interleave loops for load/store throughput.
287 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
288     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
289     cl::desc(
290         "Enable runtime interleaving until load/store ports are saturated"));
291 
292 /// Interleave small loops with scalar reductions.
293 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
294     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
295     cl::desc("Enable interleaving for loops with small iteration counts that "
296              "contain scalar reductions to expose ILP."));
297 
298 /// The number of stores in a loop that are allowed to need predication.
299 static cl::opt<unsigned> NumberOfStoresToPredicate(
300     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
301     cl::desc("Max number of stores to be predicated behind an if."));
302 
303 static cl::opt<bool> EnableIndVarRegisterHeur(
304     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
305     cl::desc("Count the induction variable only once when interleaving"));
306 
307 static cl::opt<bool> EnableCondStoresVectorization(
308     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
309     cl::desc("Enable if-predication of stores during vectorization."));
310 
311 static cl::opt<unsigned> MaxNestedScalarReductionIC(
312     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
313     cl::desc("The maximum interleave count to use when interleaving a scalar "
314              "reduction in a nested loop."));
315 
316 static cl::opt<bool>
317     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
318                            cl::Hidden,
319                            cl::desc("Prefer in-loop vector reductions, "
320                                     "overriding the target's preference."));
321 
322 static cl::opt<bool> PreferPredicatedReductionSelect(
323     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
324     cl::desc(
325         "Prefer predicating a reduction operation over an after loop select."));
326 
327 cl::opt<bool> EnableVPlanNativePath(
328     "enable-vplan-native-path", cl::init(false), cl::Hidden,
329     cl::desc("Enable VPlan-native vectorization path with "
330              "support for outer loop vectorization."));
331 
332 // FIXME: Remove this switch once we have divergence analysis. Currently we
333 // assume divergent non-backedge branches when this switch is true.
334 cl::opt<bool> EnableVPlanPredication(
335     "enable-vplan-predication", cl::init(false), cl::Hidden,
336     cl::desc("Enable VPlan-native vectorization path predicator with "
337              "support for outer loop vectorization."));
338 
339 // This flag enables the stress testing of the VPlan H-CFG construction in the
340 // VPlan-native vectorization path. It must be used in conjunction with
341 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
342 // verification of the H-CFGs built.
343 static cl::opt<bool> VPlanBuildStressTest(
344     "vplan-build-stress-test", cl::init(false), cl::Hidden,
345     cl::desc(
346         "Build VPlan for every supported loop nest in the function and bail "
347         "out right after the build (stress test the VPlan H-CFG construction "
348         "in the VPlan-native vectorization path)."));
349 
350 cl::opt<bool> llvm::EnableLoopInterleaving(
351     "interleave-loops", cl::init(true), cl::Hidden,
352     cl::desc("Enable loop interleaving in Loop vectorization passes"));
353 cl::opt<bool> llvm::EnableLoopVectorization(
354     "vectorize-loops", cl::init(true), cl::Hidden,
355     cl::desc("Run the Loop vectorization passes"));
356 
357 /// A helper function that returns the type of loaded or stored value.
358 static Type *getMemInstValueType(Value *I) {
359   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
360          "Expected Load or Store instruction");
361   if (auto *LI = dyn_cast<LoadInst>(I))
362     return LI->getType();
363   return cast<StoreInst>(I)->getValueOperand()->getType();
364 }
365 
366 /// A helper function that returns true if the given type is irregular. The
367 /// type is irregular if its allocated size doesn't equal the store size of an
368 /// element of the corresponding vector type at the given vectorization factor.
369 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
370   // Determine if an array of VF elements of type Ty is "bitcast compatible"
371   // with a <VF x Ty> vector.
372   if (VF.isVector()) {
373     auto *VectorTy = VectorType::get(Ty, VF);
374     return TypeSize::get(VF.getKnownMinValue() *
375                              DL.getTypeAllocSize(Ty).getFixedValue(),
376                          VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
377   }
378 
379   // If the vectorization factor is one, we just check if an array of type Ty
380   // requires padding between elements.
381   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
382 }
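
// For example (illustrative; the exact sizes depend on the target DataLayout),
// x86_fp80 typically has an allocation size of 16 bytes but a store size of
// only 10 bytes, so with VF = 4 an array of 4 elements occupies 64 bytes while
// <4 x x86_fp80> stores only 40; hasIrregularType() would therefore return
// true for it.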
383 
384 /// A helper function that returns the reciprocal of the block probability of
385 /// predicated blocks. If we return X, we are assuming the predicated block
386 /// will execute once for every X iterations of the loop header.
387 ///
388 /// TODO: We should use actual block probability here, if available. Currently,
389 ///       we always assume predicated blocks have a 50% chance of executing.
390 static unsigned getReciprocalPredBlockProb() { return 2; }
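
// A minimal sketch of how such a reciprocal probability can be applied when
// discounting the cost of a predicated block ('BlockCost' and the function
// name below are hypothetical):
//
//   static unsigned discountPredicatedBlockCost(unsigned BlockCost) {
//     // Assume the block executes once every getReciprocalPredBlockProb()
//     // iterations of the loop header.
//     return BlockCost / getReciprocalPredBlockProb();
//   }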
391 
392 /// A helper function that adds a 'fast' flag to floating-point operations.
393 static Value *addFastMathFlag(Value *V) {
394   if (isa<FPMathOperator>(V))
395     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
396   return V;
397 }
398 
399 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
400   if (isa<FPMathOperator>(V))
401     cast<Instruction>(V)->setFastMathFlags(FMF);
402   return V;
403 }
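
// For example (sketch, assuming an IRBuilder<> Builder and two non-constant
// floating-point values A and B in scope):
//
//   Value *Sum = addFastMathFlag(Builder.CreateFAdd(A, B));
//
// returns the new fadd with the 'fast' flag set; values that are not
// floating-point operations are returned unchanged.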
404 
405 /// A helper function that returns an integer or floating-point constant with
406 /// value C.
407 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
408   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
409                            : ConstantFP::get(Ty, C);
410 }
411 
412 /// Returns "best known" trip count for the specified loop \p L as defined by
413 /// the following procedure:
414 ///   1) Returns exact trip count if it is known.
415 ///   2) Returns expected trip count according to profile data if any.
416 ///   3) Returns upper bound estimate if it is known.
417 ///   4) Returns None if all of the above failed.
418 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
419   // Check if exact trip count is known.
420   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
421     return ExpectedTC;
422 
423   // Check if there is an expected trip count available from profile data.
424   if (LoopVectorizeWithBlockFrequency)
425     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
426       return EstimatedTC;
427 
428   // Check if upper bound estimate is known.
429   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
430     return ExpectedTC;
431 
432   return None;
433 }
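
// A typical use (sketch; SE and L stand for a ScalarEvolution reference and a
// Loop pointer assumed to be in scope):
//
//   if (auto ExpectedTC = getSmallBestKnownTC(SE, L))
//     if (*ExpectedTC < TinyTripCountVectorThreshold)
//       ; // treat L as a tiny-trip-count loop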
434 
435 namespace llvm {
436 
437 /// InnerLoopVectorizer vectorizes loops which contain only one basic
438 /// block to a specified vectorization factor (VF).
439 /// This class performs the widening of scalars into vectors, or into multiple
440 /// scalar copies. This class also implements the following features:
441 /// * It inserts an epilogue loop for handling loops that don't have iteration
442 ///   counts that are known to be a multiple of the vectorization factor.
443 /// * It handles the code generation for reduction variables.
444 /// * Scalarization (implementation using scalars) of un-vectorizable
445 ///   instructions.
446 /// InnerLoopVectorizer does not perform any vectorization-legality
447 /// checks, and relies on the caller to check for the different legality
448 /// aspects. The InnerLoopVectorizer relies on the
449 /// LoopVectorizationLegality class to provide information about the induction
450 /// and reduction variables that were found for a given vectorization factor.
451 class InnerLoopVectorizer {
452 public:
453   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
454                       LoopInfo *LI, DominatorTree *DT,
455                       const TargetLibraryInfo *TLI,
456                       const TargetTransformInfo *TTI, AssumptionCache *AC,
457                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
458                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
459                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
460                       ProfileSummaryInfo *PSI)
461       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
462         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
463         Builder(PSE.getSE()->getContext()),
464         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
465         BFI(BFI), PSI(PSI) {
466     // Query this against the original loop and save it here because the profile
467     // of the original loop header may change as the transformation happens.
468     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
469         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
470   }
471 
472   virtual ~InnerLoopVectorizer() = default;
473 
474   /// Create a new empty loop that will contain vectorized instructions later
475   /// on, while the old loop will be used as the scalar remainder. Control flow
476   /// is generated around the vectorized (and scalar epilogue) loops consisting
477   /// of various checks and bypasses. Return the pre-header block of the new
478   /// loop.
479   /// In the case of epilogue vectorization, this function is overridden to
480   /// handle the more complex control flow around the loops.
481   virtual BasicBlock *createVectorizedLoopSkeleton();
482 
483   /// Widen a single instruction within the innermost loop.
484   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
485                         VPTransformState &State);
486 
487   /// Widen a single call instruction within the innermost loop.
488   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
489                             VPTransformState &State);
490 
491   /// Widen a single select instruction within the innermost loop.
492   void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
493                               bool InvariantCond, VPTransformState &State);
494 
495   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
496   void fixVectorizedLoop();
497 
498   // Return true if any runtime check is added.
499   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
500 
501   /// A type for vectorized values in the new loop. Each value from the
502   /// original loop, when vectorized, is represented by UF vector values in the
503   /// new unrolled loop, where UF is the unroll factor.
504   using VectorParts = SmallVector<Value *, 2>;
505 
506   /// Vectorize a single GetElementPtrInst based on information gathered and
507   /// decisions taken during planning.
508   void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
509                 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
510                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
511 
512   /// Vectorize a single PHINode in a block. This method handles the induction
513   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
514   /// arbitrary length vectors.
515   void widenPHIInstruction(Instruction *PN, unsigned UF, ElementCount VF);
516 
517   /// A helper function to scalarize a single Instruction in the innermost loop.
518   /// Generates a sequence of scalar instances for each lane between \p MinLane
519   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
520   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
521   /// Instr's operands.
522   void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
523                             const VPIteration &Instance, bool IfPredicateInstr,
524                             VPTransformState &State);
525 
526   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
527   /// is provided, the integer induction variable will first be truncated to
528   /// the corresponding type.
529   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
530 
531   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
532   /// vector or scalar value on-demand if one is not yet available. When
533   /// vectorizing a loop, we visit the definition of an instruction before its
534   /// uses. When visiting the definition, we either vectorize or scalarize the
535   /// instruction, creating an entry for it in the corresponding map. (In some
536   /// cases, such as induction variables, we will create both vector and scalar
537   /// entries.) Then, as we encounter uses of the definition, we derive values
538   /// for each scalar or vector use unless such a value is already available.
539   /// For example, if we scalarize a definition and one of its uses is vector,
540   /// we build the required vector on-demand with an insertelement sequence
541   /// when visiting the use. Otherwise, if the use is scalar, we can use the
542   /// existing scalar definition.
543   ///
544   /// Return a value in the new loop corresponding to \p V from the original
545   /// loop at unroll index \p Part. If the value has already been vectorized,
546   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
547   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
548   /// a new vector value on-demand by inserting the scalar values into a vector
549   /// with an insertelement sequence. If the value has been neither vectorized
550   /// nor scalarized, it must be loop invariant, so we simply broadcast the
551   /// value into a vector.
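  ///
  /// For example (illustrative): if a definition %a was scalarized for VF = 4,
  /// a later vector use of %a at unroll index Part is built by inserting the
  /// four scalar copies of %a for that part into a vector with a chain of
  /// insertelement instructions, and the result is cached in
  /// VectorLoopValueMap for subsequent uses.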
552   Value *getOrCreateVectorValue(Value *V, unsigned Part);
553 
554   void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
555     VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
556   }
557 
558   /// Return a value in the new loop corresponding to \p V from the original
559   /// loop at unroll and vector indices \p Instance. If the value has been
560   /// vectorized but not scalarized, the necessary extractelement instruction
561   /// will be generated.
562   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
563 
564   /// Construct the vector value of a scalarized value \p V one lane at a time.
565   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
566 
567   /// Try to vectorize interleaved access group \p Group with the base address
568   /// given in \p Addr, optionally masking the vector operations if \p
569   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
570   /// values in the vectorized loop.
571   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
572                                 VPTransformState &State, VPValue *Addr,
573                                 ArrayRef<VPValue *> StoredValues,
574                                 VPValue *BlockInMask = nullptr);
575 
576   /// Vectorize Load and Store instructions with the base address given in \p
577   /// Addr, optionally masking the vector operations if \p BlockInMask is
578   /// non-null. Use \p State to translate given VPValues to IR values in the
579   /// vectorized loop.
580   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
581                                   VPValue *Def, VPValue *Addr,
582                                   VPValue *StoredValue, VPValue *BlockInMask);
583 
584   /// Set the debug location in the builder using the debug location in
585   /// the instruction.
586   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
587 
588   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
589   void fixNonInductionPHIs();
590 
591 protected:
592   friend class LoopVectorizationPlanner;
593 
594   /// A small list of PHINodes.
595   using PhiVector = SmallVector<PHINode *, 4>;
596 
597   /// A type for scalarized values in the new loop. Each value from the
598   /// original loop, when scalarized, is represented by UF x VF scalar values
599   /// in the new unrolled loop, where UF is the unroll factor and VF is the
600   /// vectorization factor.
601   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
602 
603   /// Set up the values of the IVs correctly when exiting the vector loop.
604   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
605                     Value *CountRoundDown, Value *EndValue,
606                     BasicBlock *MiddleBlock);
607 
608   /// Create a new induction variable inside L.
609   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
610                                    Value *Step, Instruction *DL);
611 
612   /// Handle all cross-iteration phis in the header.
613   void fixCrossIterationPHIs();
614 
615   /// Fix a first-order recurrence. This is the second phase of vectorizing
616   /// this phi node.
617   void fixFirstOrderRecurrence(PHINode *Phi);
618 
619   /// Fix a reduction cross-iteration phi. This is the second phase of
620   /// vectorizing this phi node.
621   void fixReduction(PHINode *Phi);
622 
623   /// Clear NSW/NUW flags from reduction instructions if necessary.
624   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
625 
626   /// The loop exit block may have single-value PHI nodes with some
627   /// incoming value. While vectorizing, we only handled real values
628   /// that were defined inside the loop, and we should have one value for
629   /// each predecessor of its parent basic block. See PR14725.
630   void fixLCSSAPHIs();
631 
632   /// Iteratively sink the scalarized operands of a predicated instruction into
633   /// the block that was created for it.
634   void sinkScalarOperands(Instruction *PredInst);
635 
636   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
637   /// represented as.
638   void truncateToMinimalBitwidths();
639 
640   /// Create a broadcast instruction. This method generates a broadcast
641   /// instruction (shuffle) for loop invariant values and for the induction
642   /// value. If this is the induction variable then we extend it to N, N+1, ...
643   /// This is needed because each iteration in the loop corresponds to a SIMD
644   /// element.
645   virtual Value *getBroadcastInstrs(Value *V);
646 
647   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
648   /// to each vector element of Val. The sequence starts at StartIndex.
649   /// \p Opcode is relevant for FP induction variable.
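  ///
  /// For example (sketch), for an integer IV with VF = 4, a splatted Val of X,
  /// StartIdx = 0 and Step = S, this produces the vector
  ///   <X, X + S, X + 2*S, X + 3*S>.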
650   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
651                                Instruction::BinaryOps Opcode =
652                                Instruction::BinaryOpsEnd);
653 
654   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
655   /// variable on which to base the steps, \p Step is the size of the step, and
656   /// \p EntryVal is the value from the original loop that maps to the steps.
657   /// Note that \p EntryVal doesn't have to be an induction variable - it
658   /// can also be a truncate instruction.
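  ///
  /// For example (illustrative), with VF = 4, UF = 2 and Step = 2, the scalar
  /// steps generated for the lanes of the two unrolled parts are
  ///   ScalarIV + {0, 2, 4, 6}  and  ScalarIV + {8, 10, 12, 14}.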
659   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
660                         const InductionDescriptor &ID);
661 
662   /// Create a vector induction phi node based on an existing scalar one. \p
663   /// EntryVal is the value from the original loop that maps to the vector phi
664   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
665   /// truncate instruction, instead of widening the original IV, we widen a
666   /// version of the IV truncated to \p EntryVal's type.
667   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
668                                        Value *Step, Instruction *EntryVal);
669 
670   /// Returns true if an instruction \p I should be scalarized instead of
671   /// vectorized for the chosen vectorization factor.
672   bool shouldScalarizeInstruction(Instruction *I) const;
673 
674   /// Returns true if we should generate a scalar version of \p IV.
675   bool needsScalarInduction(Instruction *IV) const;
676 
677   /// If there is a cast involved in the induction variable \p ID, which should
678   /// be ignored in the vectorized loop body, this function records the
679   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
680   /// cast. We had already proved that the casted Phi is equal to the uncasted
681   /// Phi in the vectorized loop (under a runtime guard), and therefore
682   /// there is no need to vectorize the cast - the same value can be used in the
683   /// vector loop for both the Phi and the cast.
684   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
685   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
686   ///
687   /// \p EntryVal is the value from the original loop that maps to the vector
688   /// phi node and is used to distinguish what is the IV currently being
689   /// processed - original one (if \p EntryVal is a phi corresponding to the
690   /// original IV) or the "newly-created" one based on the proof mentioned above
691   /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
692   /// latter case \p EntryVal is a TruncInst and we must not record anything for
693   /// that IV, but it's error-prone to expect callers of this routine to care
694   /// about that, hence this explicit parameter.
695   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
696                                              const Instruction *EntryVal,
697                                              Value *VectorLoopValue,
698                                              unsigned Part,
699                                              unsigned Lane = UINT_MAX);
700 
701   /// Generate a shuffle sequence that will reverse the vector Vec.
702   virtual Value *reverseVector(Value *Vec);
703 
704   /// Returns (and creates if needed) the original loop trip count.
705   Value *getOrCreateTripCount(Loop *NewLoop);
706 
707   /// Returns (and creates if needed) the trip count of the widened loop.
708   Value *getOrCreateVectorTripCount(Loop *NewLoop);
709 
710   /// Returns a bitcasted value to the requested vector type.
711   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
712   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
713                                 const DataLayout &DL);
714 
715   /// Emit a bypass check to see if the vector trip count is zero, including if
716   /// it overflows.
717   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
718 
719   /// Emit a bypass check to see if all of the SCEV assumptions we've
720   /// had to make are correct.
721   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
722 
723   /// Emit bypass checks to check any memory assumptions we may have made.
724   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
725 
726   /// Compute the transformed value of Index at offset StartValue using step
727   /// StepValue.
728   /// For integer induction, returns StartValue + Index * StepValue.
729   /// For pointer induction, returns StartValue[Index * StepValue].
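  /// For example (illustrative), an integer IV described by StartValue = 7 and
  /// StepValue = 3 maps Index = 4 to 7 + 4 * 3 = 19, while a pointer IV with
  /// the same step maps it to &StartValue[12].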
730   /// FIXME: The newly created binary instructions should contain nsw/nuw
731   /// flags, which can be found from the original scalar operations.
732   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
733                               const DataLayout &DL,
734                               const InductionDescriptor &ID) const;
735 
736   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
737   /// vector loop preheader, middle block and scalar preheader. Also
738   /// allocate a loop object for the new vector loop and return it.
739   Loop *createVectorLoopSkeleton(StringRef Prefix);
740 
741   /// Create new phi nodes for the induction variables to resume iteration count
742   /// in the scalar epilogue, from where the vectorized loop left off (given by
743   /// \p VectorTripCount).
744   /// In cases where the loop skeleton is more complicated (e.g. epilogue
745   /// vectorization) and the resume values can come from an additional bypass
746   /// block, the \p AdditionalBypass pair provides information about the bypass
747   /// block and the end value on the edge from bypass to this loop.
748   void createInductionResumeValues(
749       Loop *L, Value *VectorTripCount,
750       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
751 
752   /// Complete the loop skeleton by adding debug MDs, creating appropriate
753   /// conditional branches in the middle block, preparing the builder and
754   /// running the verifier. Take in the vector loop \p L as argument, and return
755   /// the preheader of the completed vector loop.
756   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
757 
758   /// Add additional metadata to \p To that was not present on \p Orig.
759   ///
760   /// Currently this is used to add the noalias annotations based on the
761   /// inserted memchecks.  Use this for instructions that are *cloned* into the
762   /// vector loop.
763   void addNewMetadata(Instruction *To, const Instruction *Orig);
764 
765   /// Add metadata from one instruction to another.
766   ///
767   /// This includes both the original MDs from \p From and additional ones (\see
768   /// addNewMetadata).  Use this for *newly created* instructions in the vector
769   /// loop.
770   void addMetadata(Instruction *To, Instruction *From);
771 
772   /// Similar to the previous function but it adds the metadata to a
773   /// vector of instructions.
774   void addMetadata(ArrayRef<Value *> To, Instruction *From);
775 
776   /// Allow subclasses to override and print debug traces before/after vplan
777   /// execution, when trace information is requested.
778   virtual void printDebugTracesAtStart() {}
779   virtual void printDebugTracesAtEnd() {}
780 
781   /// The original loop.
782   Loop *OrigLoop;
783 
784   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
785   /// dynamic knowledge to simplify SCEV expressions and converts them to a
786   /// more usable form.
787   PredicatedScalarEvolution &PSE;
788 
789   /// Loop Info.
790   LoopInfo *LI;
791 
792   /// Dominator Tree.
793   DominatorTree *DT;
794 
795   /// Alias Analysis.
796   AAResults *AA;
797 
798   /// Target Library Info.
799   const TargetLibraryInfo *TLI;
800 
801   /// Target Transform Info.
802   const TargetTransformInfo *TTI;
803 
804   /// Assumption Cache.
805   AssumptionCache *AC;
806 
807   /// Interface to emit optimization remarks.
808   OptimizationRemarkEmitter *ORE;
809 
810   /// LoopVersioning.  It's only set up (non-null) if memchecks were
811   /// used.
812   ///
813   /// This is currently only used to add no-alias metadata based on the
814   /// memchecks.  The actual versioning is performed manually.
815   std::unique_ptr<LoopVersioning> LVer;
816 
817   /// The vectorization SIMD factor to use. Each vector will have this many
818   /// vector elements.
819   ElementCount VF;
820 
821   /// The vectorization unroll factor to use. Each scalar is vectorized to this
822   /// many different vector instructions.
823   unsigned UF;
824 
825   /// The builder that we use.
826   IRBuilder<> Builder;
827 
828   // --- Vectorization state ---
829 
830   /// The vector-loop preheader.
831   BasicBlock *LoopVectorPreHeader;
832 
833   /// The scalar-loop preheader.
834   BasicBlock *LoopScalarPreHeader;
835 
836   /// Middle Block between the vector and the scalar.
837   BasicBlock *LoopMiddleBlock;
838 
839   /// The ExitBlock of the scalar loop.
840   BasicBlock *LoopExitBlock;
841 
842   /// The vector loop body.
843   BasicBlock *LoopVectorBody;
844 
845   /// The scalar loop body.
846   BasicBlock *LoopScalarBody;
847 
848   /// A list of all bypass blocks. The first block is the entry of the loop.
849   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
850 
851   /// The new Induction variable which was added to the new block.
852   PHINode *Induction = nullptr;
853 
854   /// The induction variable of the old basic block.
855   PHINode *OldInduction = nullptr;
856 
857   /// Maps values from the original loop to their corresponding values in the
858   /// vectorized loop. A key value can map to either vector values, scalar
859   /// values or both kinds of values, depending on whether the key was
860   /// vectorized and scalarized.
861   VectorizerValueMap VectorLoopValueMap;
862 
863   /// Store instructions that were predicated.
864   SmallVector<Instruction *, 4> PredicatedInstructions;
865 
866   /// Trip count of the original loop.
867   Value *TripCount = nullptr;
868 
869   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
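  /// (For example, TripCount = 17 with VF = 4 and UF = 2 gives a vector trip
  /// count of 17 - 17 % 8 = 16, leaving one iteration for the scalar loop.)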
870   Value *VectorTripCount = nullptr;
871 
872   /// The legality analysis.
873   LoopVectorizationLegality *Legal;
874 
875   /// The profitability analysis.
876   LoopVectorizationCostModel *Cost;
877 
878   // Record whether runtime checks are added.
879   bool AddedSafetyChecks = false;
880 
881   // Holds the end values for each induction variable. We save the end values
882   // so we can later fix up the external users of the induction variables.
883   DenseMap<PHINode *, Value *> IVEndValues;
884 
885   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
886   // fixed up at the end of vector code generation.
887   SmallVector<PHINode *, 8> OrigPHIsToFix;
888 
889   /// BFI and PSI are used to check for profile guided size optimizations.
890   BlockFrequencyInfo *BFI;
891   ProfileSummaryInfo *PSI;
892 
893   // Whether this loop should be optimized for size based on profile-guided
894   // size optimizations.
895   bool OptForSizeBasedOnProfile;
896 };
897 
898 class InnerLoopUnroller : public InnerLoopVectorizer {
899 public:
900   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
901                     LoopInfo *LI, DominatorTree *DT,
902                     const TargetLibraryInfo *TLI,
903                     const TargetTransformInfo *TTI, AssumptionCache *AC,
904                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
905                     LoopVectorizationLegality *LVL,
906                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
907                     ProfileSummaryInfo *PSI)
908       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
909                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
910                             BFI, PSI) {}
911 
912 private:
913   Value *getBroadcastInstrs(Value *V) override;
914   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
915                        Instruction::BinaryOps Opcode =
916                        Instruction::BinaryOpsEnd) override;
917   Value *reverseVector(Value *Vec) override;
918 };
919 
920 /// Encapsulate information regarding vectorization of a loop and its epilogue.
921 /// This information is meant to be updated and used across two stages of
922 /// epilogue vectorization.
923 struct EpilogueLoopVectorizationInfo {
924   ElementCount MainLoopVF = ElementCount::getFixed(0);
925   unsigned MainLoopUF = 0;
926   ElementCount EpilogueVF = ElementCount::getFixed(0);
927   unsigned EpilogueUF = 0;
928   BasicBlock *MainLoopIterationCountCheck = nullptr;
929   BasicBlock *EpilogueIterationCountCheck = nullptr;
930   BasicBlock *SCEVSafetyCheck = nullptr;
931   BasicBlock *MemSafetyCheck = nullptr;
932   Value *TripCount = nullptr;
933   Value *VectorTripCount = nullptr;
934 
935   EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
936                                 unsigned EUF)
937       : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
938         EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
939     assert(EUF == 1 &&
940            "A high UF for the epilogue loop is likely not beneficial.");
941   }
942 };
943 
944 /// An extension of the inner loop vectorizer that creates a skeleton for a
945 /// vectorized loop that has its epilogue (residual) also vectorized.
946 /// The idea is to run the vplan on a given loop twice, first to set up the
947 /// skeleton and vectorize the main loop, and second to complete the skeleton
948 /// from the first step and vectorize the epilogue.  This is achieved by
949 /// deriving two concrete strategy classes from this base class and invoking
950 /// them in succession from the loop vectorizer planner.
951 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
952 public:
953   InnerLoopAndEpilogueVectorizer(
954       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
955       DominatorTree *DT, const TargetLibraryInfo *TLI,
956       const TargetTransformInfo *TTI, AssumptionCache *AC,
957       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
958       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
959       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
960       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
961                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI),
962         EPI(EPI) {}
963 
964   // Override this function to handle the more complex control flow around the
965   // three loops.
966   BasicBlock *createVectorizedLoopSkeleton() final override {
967     return createEpilogueVectorizedLoopSkeleton();
968   }
969 
970   /// The interface for creating a vectorized skeleton using one of two
971   /// different strategies, each corresponding to one execution of the vplan
972   /// as described above.
973   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
974 
975   /// Holds and updates state information required to vectorize the main loop
976   /// and its epilogue in two separate passes. This setup helps us avoid
977   /// regenerating and recomputing runtime safety checks. It also helps us to
978   /// shorten the iteration-count-check path length for the cases where the
979   /// iteration count of the loop is so small that the main vector loop is
980   /// completely skipped.
981   EpilogueLoopVectorizationInfo &EPI;
982 };
983 
984 /// A specialized derived class of inner loop vectorizer that performs
985 /// vectorization of *main* loops in the process of vectorizing loops and their
986 /// epilogues.
987 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
988 public:
989   EpilogueVectorizerMainLoop(
990       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
991       DominatorTree *DT, const TargetLibraryInfo *TLI,
992       const TargetTransformInfo *TTI, AssumptionCache *AC,
993       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
994       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
995       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
996       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
997                                        EPI, LVL, CM, BFI, PSI) {}
998   /// Implements the interface for creating a vectorized skeleton using the
999   /// *main loop* strategy (i.e. the first pass of vplan execution).
1000   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1001 
1002 protected:
1003   /// Emits an iteration count bypass check once for the main loop (when \p
1004   /// ForEpilogue is false) and once for the epilogue loop (when \p
1005   /// ForEpilogue is true).
1006   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
1007                                              bool ForEpilogue);
1008   void printDebugTracesAtStart() override;
1009   void printDebugTracesAtEnd() override;
1010 };
1011 
1012 /// A specialized derived class of inner loop vectorizer that performs
1013 /// vectorization of *epilogue* loops in the process of vectorizing loops and
1014 /// their epilogues.
1015 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
1016 public:
1017   EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
1018                     LoopInfo *LI, DominatorTree *DT,
1019                     const TargetLibraryInfo *TLI,
1020                     const TargetTransformInfo *TTI, AssumptionCache *AC,
1021                     OptimizationRemarkEmitter *ORE,
1022                     EpilogueLoopVectorizationInfo &EPI,
1023                     LoopVectorizationLegality *LVL,
1024                     llvm::LoopVectorizationCostModel *CM,
1025                     BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
1026       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1027                                        EPI, LVL, CM, BFI, PSI) {}
1028   /// Implements the interface for creating a vectorized skeleton using the
1029   /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
1030   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1031 
1032 protected:
1033   /// Emits an iteration count bypass check after the main vector loop has
1034   /// finished to see if there are any iterations left to execute by either
1035   /// the vector epilogue or the scalar epilogue.
1036   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1037                                                       BasicBlock *Bypass,
1038                                                       BasicBlock *Insert);
1039   void printDebugTracesAtStart() override;
1040   void printDebugTracesAtEnd() override;
1041 };
1042 } // end namespace llvm
1043 
1044 /// Look for a meaningful debug location on the instruction or its
1045 /// operands.
1046 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1047   if (!I)
1048     return I;
1049 
1050   DebugLoc Empty;
1051   if (I->getDebugLoc() != Empty)
1052     return I;
1053 
1054   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
1055     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
1056       if (OpInst->getDebugLoc() != Empty)
1057         return OpInst;
1058   }
1059 
1060   return I;
1061 }
1062 
1063 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
1064   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1065     const DILocation *DIL = Inst->getDebugLoc();
1066     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1067         !isa<DbgInfoIntrinsic>(Inst)) {
1068       assert(!VF.isScalable() && "scalable vectors not yet supported.");
1069       auto NewDIL =
1070           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1071       if (NewDIL)
1072         B.SetCurrentDebugLocation(NewDIL.getValue());
1073       else
1074         LLVM_DEBUG(dbgs()
1075                    << "Failed to create new discriminator: "
1076                    << DIL->getFilename() << " Line: " << DIL->getLine());
1077     }
1078     else
1079       B.SetCurrentDebugLocation(DIL);
1080   } else
1081     B.SetCurrentDebugLocation(DebugLoc());
1082 }
1083 
1084 /// Write a record \p DebugMsg about vectorization failure to the debug
1085 /// output stream. If \p I is passed, it is an instruction that prevents
1086 /// vectorization.
1087 #ifndef NDEBUG
1088 static void debugVectorizationFailure(const StringRef DebugMsg,
1089     Instruction *I) {
1090   dbgs() << "LV: Not vectorizing: " << DebugMsg;
1091   if (I != nullptr)
1092     dbgs() << " " << *I;
1093   else
1094     dbgs() << '.';
1095   dbgs() << '\n';
1096 }
1097 #endif
1098 
1099 /// Create an analysis remark that explains why vectorization failed
1100 ///
1101 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1102 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1103 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1104 /// the location of the remark.  \return the remark object that can be
1105 /// streamed to.
1106 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1107     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1108   Value *CodeRegion = TheLoop->getHeader();
1109   DebugLoc DL = TheLoop->getStartLoc();
1110 
1111   if (I) {
1112     CodeRegion = I->getParent();
1113     // If there is no debug location attached to the instruction, fall back to
1114     // using the loop's.
1115     if (I->getDebugLoc())
1116       DL = I->getDebugLoc();
1117   }
1118 
1119   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
1120   R << "loop not vectorized: ";
1121   return R;
1122 }
1123 
1124 /// Return a value for Step multiplied by VF.
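/// For example (sketch): with Step = i64 2 and a fixed VF of 4 this returns
/// the constant i64 8, while for a scalable VF of 4 it returns the runtime
/// value 8 * vscale (materialized via IRBuilder::CreateVScale).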
1125 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1126   assert(isa<ConstantInt>(Step) && "Expected an integer step");
1127   Constant *StepVal = ConstantInt::get(
1128       Step->getType(),
1129       cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1130   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1131 }
1132 
1133 namespace llvm {
1134 
1135 void reportVectorizationFailure(const StringRef DebugMsg,
1136     const StringRef OREMsg, const StringRef ORETag,
1137     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
1138   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
1139   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1140   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
1141                 ORETag, TheLoop, I) << OREMsg);
1142 }
1143 
1144 } // end namespace llvm
1145 
1146 #ifndef NDEBUG
1147 /// \return string containing a file name and a line # for the given loop.
1148 static std::string getDebugLocString(const Loop *L) {
1149   std::string Result;
1150   if (L) {
1151     raw_string_ostream OS(Result);
1152     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1153       LoopDbgLoc.print(OS);
1154     else
1155       // Just print the module name.
1156       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1157     OS.flush();
1158   }
1159   return Result;
1160 }
1161 #endif
1162 
1163 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1164                                          const Instruction *Orig) {
1165   // If the loop was versioned with memchecks, add the corresponding no-alias
1166   // metadata.
1167   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1168     LVer->annotateInstWithNoAlias(To, Orig);
1169 }
1170 
1171 void InnerLoopVectorizer::addMetadata(Instruction *To,
1172                                       Instruction *From) {
1173   propagateMetadata(To, From);
1174   addNewMetadata(To, From);
1175 }
1176 
1177 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1178                                       Instruction *From) {
1179   for (Value *V : To) {
1180     if (Instruction *I = dyn_cast<Instruction>(V))
1181       addMetadata(I, From);
1182   }
1183 }
1184 
1185 namespace llvm {
1186 
1187 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1188 // lowered.
1189 enum ScalarEpilogueLowering {
1190 
1191   // The default: allowing scalar epilogues.
1192   CM_ScalarEpilogueAllowed,
1193 
1194   // Vectorization with OptForSize: don't allow epilogues.
1195   CM_ScalarEpilogueNotAllowedOptSize,
1196 
  // A special case of vectorization with OptForSize: loops with a very small
1198   // trip count are considered for vectorization under OptForSize, thereby
1199   // making sure the cost of their loop body is dominant, free of runtime
1200   // guards and scalar iteration overheads.
1201   CM_ScalarEpilogueNotAllowedLowTripLoop,
1202 
1203   // Loop hint predicate indicating an epilogue is undesired.
1204   CM_ScalarEpilogueNotNeededUsePredicate,
1205 
  // Directive indicating we must either tail-fold or not vectorize.
1207   CM_ScalarEpilogueNotAllowedUsePredicate
1208 };
1209 
1210 /// LoopVectorizationCostModel - estimates the expected speedups due to
1211 /// vectorization.
1212 /// In many cases vectorization is not profitable. This can happen because of
1213 /// a number of reasons. In this class we mainly attempt to predict the
1214 /// expected speedup/slowdowns due to the supported instruction set. We use the
1215 /// TargetTransformInfo to query the different backends for the cost of
1216 /// different operations.
1217 class LoopVectorizationCostModel {
1218 public:
1219   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1220                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1221                              LoopVectorizationLegality *Legal,
1222                              const TargetTransformInfo &TTI,
1223                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1224                              AssumptionCache *AC,
1225                              OptimizationRemarkEmitter *ORE, const Function *F,
1226                              const LoopVectorizeHints *Hints,
1227                              InterleavedAccessInfo &IAI)
1228       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1229         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1230         Hints(Hints), InterleaveInfo(IAI) {}
1231 
1232   /// \return An upper bound for the vectorization factor, or None if
1233   /// vectorization and interleaving should be avoided up front.
1234   Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1235 
1236   /// \return True if runtime checks are required for vectorization, and false
1237   /// otherwise.
1238   bool runtimeChecksRequired();
1239 
1240   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is non-zero,
  /// that vectorization factor is selected if vectorization is possible.
1244   VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1245   VectorizationFactor
1246   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1247                                     const LoopVectorizationPlanner &LVP);
1248 
1249   /// Setup cost-based decisions for user vectorization factor.
1250   void selectUserVectorizationFactor(ElementCount UserVF) {
1251     collectUniformsAndScalars(UserVF);
1252     collectInstsToScalarize(UserVF);
1253   }
1254 
1255   /// \return The size (in bits) of the smallest and widest types in the code
1256   /// that needs to be vectorized. We ignore values that remain scalar such as
1257   /// 64 bit loop indices.
1258   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1259 
1260   /// \return The desired interleave count.
1261   /// If interleave count has been specified by metadata it will be returned.
1262   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1263   /// are the selected vectorization factor and the cost of the selected VF.
1264   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1265 
  /// A memory access instruction may be vectorized in more than one way. The
  /// form of the instruction after vectorization depends on its cost. This
  /// function takes cost-based decisions for Load/Store instructions and
  /// collects them in a map. This decision map is used for building the lists
  /// of loop-uniform and loop-scalar instructions. The calculated cost is
  /// saved with the widening decision in order to avoid redundant
  /// calculations.
1273   void setCostBasedWideningDecision(ElementCount VF);
1274 
1275   /// A struct that represents some properties of the register usage
1276   /// of a loop.
1277   struct RegisterUsage {
1278     /// Holds the number of loop invariant values that are used in the loop.
1279     /// The key is ClassID of target-provided register class.
1280     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1281     /// Holds the maximum number of concurrent live intervals in the loop.
1282     /// The key is ClassID of target-provided register class.
1283     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1284   };
1285 
1286   /// \return Returns information about the register usages of the loop for the
1287   /// given vectorization factors.
1288   SmallVector<RegisterUsage, 8>
1289   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1290 
1291   /// Collect values we want to ignore in the cost model.
1292   void collectValuesToIgnore();
1293 
  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
1296   void collectInLoopReductions();
1297 
1298   /// \returns The smallest bitwidth each instruction can be represented with.
1299   /// The vector equivalents of these instructions should be truncated to this
1300   /// type.
1301   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1302     return MinBWs;
1303   }
1304 
1305   /// \returns True if it is more profitable to scalarize instruction \p I for
1306   /// vectorization factor \p VF.
1307   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1308     assert(VF.isVector() &&
1309            "Profitable to scalarize relevant only for VF > 1.");
1310 
1311     // Cost model is not run in the VPlan-native path - return conservative
1312     // result until this changes.
1313     if (EnableVPlanNativePath)
1314       return false;
1315 
1316     auto Scalars = InstsToScalarize.find(VF);
1317     assert(Scalars != InstsToScalarize.end() &&
1318            "VF not yet analyzed for scalarization profitability");
1319     return Scalars->second.find(I) != Scalars->second.end();
1320   }
1321 
1322   /// Returns true if \p I is known to be uniform after vectorization.
1323   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1324     if (VF.isScalar())
1325       return true;
1326 
1327     // Cost model is not run in the VPlan-native path - return conservative
1328     // result until this changes.
1329     if (EnableVPlanNativePath)
1330       return false;
1331 
1332     auto UniformsPerVF = Uniforms.find(VF);
1333     assert(UniformsPerVF != Uniforms.end() &&
1334            "VF not yet analyzed for uniformity");
1335     return UniformsPerVF->second.count(I);
1336   }
1337 
1338   /// Returns true if \p I is known to be scalar after vectorization.
1339   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1340     if (VF.isScalar())
1341       return true;
1342 
1343     // Cost model is not run in the VPlan-native path - return conservative
1344     // result until this changes.
1345     if (EnableVPlanNativePath)
1346       return false;
1347 
1348     auto ScalarsPerVF = Scalars.find(VF);
1349     assert(ScalarsPerVF != Scalars.end() &&
1350            "Scalar values are not calculated for VF");
1351     return ScalarsPerVF->second.count(I);
1352   }
1353 
1354   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1355   /// for vectorization factor \p VF.
1356   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1357     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1358            !isProfitableToScalarize(I, VF) &&
1359            !isScalarAfterVectorization(I, VF);
1360   }
1361 
  /// Decision that was taken during cost calculation for a memory instruction.
1363   enum InstWidening {
1364     CM_Unknown,
1365     CM_Widen,         // For consecutive accesses with stride +1.
1366     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1367     CM_Interleave,
1368     CM_GatherScatter,
1369     CM_Scalarize
1370   };
1371 
1372   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1373   /// instruction \p I and vector width \p VF.
1374   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1375                            unsigned Cost) {
1376     assert(VF.isVector() && "Expected VF >=2");
1377     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1378   }
1379 
1380   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1381   /// interleaving group \p Grp and vector width \p VF.
1382   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1383                            ElementCount VF, InstWidening W, unsigned Cost) {
1384     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group; the cost
    // will be assigned to one instruction only (the insert position).
1387     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1388       if (auto *I = Grp->getMember(i)) {
1389         if (Grp->getInsertPos() == I)
1390           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1391         else
1392           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1393       }
1394     }
1395   }
1396 
1397   /// Return the cost model decision for the given instruction \p I and vector
1398   /// width \p VF. Return CM_Unknown if this instruction did not pass
1399   /// through the cost modeling.
1400   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1401     assert(VF.isVector() && "Expected VF to be a vector VF");
1402     // Cost model is not run in the VPlan-native path - return conservative
1403     // result until this changes.
1404     if (EnableVPlanNativePath)
1405       return CM_GatherScatter;
1406 
1407     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1408     auto Itr = WideningDecisions.find(InstOnVF);
1409     if (Itr == WideningDecisions.end())
1410       return CM_Unknown;
1411     return Itr->second.first;
1412   }
1413 
1414   /// Return the vectorization cost for the given instruction \p I and vector
1415   /// width \p VF.
1416   unsigned getWideningCost(Instruction *I, ElementCount VF) {
1417     assert(VF.isVector() && "Expected VF >=2");
1418     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1419     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1420            "The cost is not calculated");
1421     return WideningDecisions[InstOnVF].second;
1422   }
1423 
1424   /// Return True if instruction \p I is an optimizable truncate whose operand
1425   /// is an induction variable. Such a truncate will be removed by adding a new
1426   /// induction variable with the destination type.
1427   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1428     // If the instruction is not a truncate, return false.
1429     auto *Trunc = dyn_cast<TruncInst>(I);
1430     if (!Trunc)
1431       return false;
1432 
1433     // Get the source and destination types of the truncate.
1434     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1435     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1436 
1437     // If the truncate is free for the given types, return false. Replacing a
1438     // free truncate with an induction variable would add an induction variable
1439     // update instruction to each iteration of the loop. We exclude from this
1440     // check the primary induction variable since it will need an update
1441     // instruction regardless.
1442     Value *Op = Trunc->getOperand(0);
1443     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1444       return false;
1445 
1446     // If the truncated value is not an induction variable, return false.
1447     return Legal->isInductionPhi(Op);
1448   }
1449 
1450   /// Collects the instructions to scalarize for each predicated instruction in
1451   /// the loop.
1452   void collectInstsToScalarize(ElementCount VF);
1453 
1454   /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decision for Load/Store instructions that may
  /// be vectorized as interleaved, gather-scatter or scalarized accesses.
1457   void collectUniformsAndScalars(ElementCount VF) {
1458     // Do the analysis once.
1459     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1460       return;
1461     setCostBasedWideningDecision(VF);
1462     collectLoopUniforms(VF);
1463     collectLoopScalars(VF);
1464   }
1465 
1466   /// Returns true if the target machine supports masked store operation
1467   /// for the given \p DataType and kind of access to \p Ptr.
1468   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1469     return Legal->isConsecutivePtr(Ptr) &&
1470            TTI.isLegalMaskedStore(DataType, Alignment);
1471   }
1472 
1473   /// Returns true if the target machine supports masked load operation
1474   /// for the given \p DataType and kind of access to \p Ptr.
1475   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1476     return Legal->isConsecutivePtr(Ptr) &&
1477            TTI.isLegalMaskedLoad(DataType, Alignment);
1478   }
1479 
1480   /// Returns true if the target machine supports masked scatter operation
1481   /// for the given \p DataType.
1482   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1483     return TTI.isLegalMaskedScatter(DataType, Alignment);
1484   }
1485 
1486   /// Returns true if the target machine supports masked gather operation
1487   /// for the given \p DataType.
1488   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1489     return TTI.isLegalMaskedGather(DataType, Alignment);
1490   }
1491 
1492   /// Returns true if the target machine can represent \p V as a masked gather
1493   /// or scatter operation.
1494   bool isLegalGatherOrScatter(Value *V) {
1495     bool LI = isa<LoadInst>(V);
1496     bool SI = isa<StoreInst>(V);
1497     if (!LI && !SI)
1498       return false;
1499     auto *Ty = getMemInstValueType(V);
1500     Align Align = getLoadStoreAlignment(V);
1501     return (LI && isLegalMaskedGather(Ty, Align)) ||
1502            (SI && isLegalMaskedScatter(Ty, Align));
1503   }
1504 
1505   /// Returns true if \p I is an instruction that will be scalarized with
1506   /// predication. Such instructions include conditional stores and
1507   /// instructions that may divide by zero.
1508   /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1510   bool isScalarWithPredication(Instruction *I,
1511                                ElementCount VF = ElementCount::getFixed(1));
1512 
1513   // Returns true if \p I is an instruction that will be predicated either
1514   // through scalar predication or masked load/store or masked gather/scatter.
1515   // Superset of instructions that return true for isScalarWithPredication.
1516   bool isPredicatedInst(Instruction *I) {
1517     if (!blockNeedsPredication(I->getParent()))
1518       return false;
1519     // Loads and stores that need some form of masked operation are predicated
1520     // instructions.
1521     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1522       return Legal->isMaskRequired(I);
1523     return isScalarWithPredication(I);
1524   }
1525 
1526   /// Returns true if \p I is a memory instruction with consecutive memory
1527   /// access that can be widened.
1528   bool
1529   memoryInstructionCanBeWidened(Instruction *I,
1530                                 ElementCount VF = ElementCount::getFixed(1));
1531 
1532   /// Returns true if \p I is a memory instruction in an interleaved-group
1533   /// of memory accesses that can be vectorized with wide vector loads/stores
1534   /// and shuffles.
1535   bool
1536   interleavedAccessCanBeWidened(Instruction *I,
1537                                 ElementCount VF = ElementCount::getFixed(1));
1538 
1539   /// Check if \p Instr belongs to any interleaved access group.
1540   bool isAccessInterleaved(Instruction *Instr) {
1541     return InterleaveInfo.isInterleaved(Instr);
1542   }
1543 
1544   /// Get the interleaved access group that \p Instr belongs to.
1545   const InterleaveGroup<Instruction> *
1546   getInterleavedAccessGroup(Instruction *Instr) {
1547     return InterleaveInfo.getInterleaveGroup(Instr);
1548   }
1549 
1550   /// Returns true if an interleaved group requires a scalar iteration
1551   /// to handle accesses with gaps, and there is nothing preventing us from
1552   /// creating a scalar epilogue.
1553   bool requiresScalarEpilogue() const {
1554     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1555   }
1556 
1557   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1558   /// loop hint annotation.
1559   bool isScalarEpilogueAllowed() const {
1560     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1561   }
1562 
1563   /// Returns true if all loop blocks should be masked to fold tail loop.
1564   bool foldTailByMasking() const { return FoldTailByMasking; }
1565 
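  /// Returns true if the instructions in block \p BB require predication, i.e.
  /// the block needs predication in the original loop or the tail is folded by
  /// masking.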
1566   bool blockNeedsPredication(BasicBlock *BB) {
1567     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1568   }
1569 
1570   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1571   /// nodes to the chain of instructions representing the reductions. Uses a
1572   /// MapVector to ensure deterministic iteration order.
1573   using ReductionChainMap =
1574       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1575 
1576   /// Return the chain of instructions representing an inloop reduction.
1577   const ReductionChainMap &getInLoopReductionChains() const {
1578     return InLoopReductionChains;
1579   }
1580 
1581   /// Returns true if the Phi is part of an inloop reduction.
1582   bool isInLoopReduction(PHINode *Phi) const {
1583     return InLoopReductionChains.count(Phi);
1584   }
1585 
1586   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1587   /// with factor VF.  Return the cost of the instruction, including
1588   /// scalarization overhead if it's needed.
1589   unsigned getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1590 
1591   /// Estimate cost of a call instruction CI if it were vectorized with factor
1592   /// VF. Return the cost of the instruction, including scalarization overhead
1593   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1596   unsigned getVectorCallCost(CallInst *CI, ElementCount VF,
1597                              bool &NeedToScalarize);
1598 
1599   /// Invalidates decisions already taken by the cost model.
1600   void invalidateCostModelingDecisions() {
1601     WideningDecisions.clear();
1602     Uniforms.clear();
1603     Scalars.clear();
1604   }
1605 
1606 private:
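  /// The number of predicated store instructions found in the loop.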
1607   unsigned NumPredStores = 0;
1608 
1609   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1610   /// than zero. One is returned if vectorization should best be avoided due
1611   /// to cost.
1612   ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1613                                     ElementCount UserVF);
1614 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1622   using VectorizationCostTy = std::pair<unsigned, bool>;
1623 
1624   /// Returns the expected execution cost. The unit of the cost does
1625   /// not matter because we use the 'cost' units to compare different
1626   /// vector widths. The cost that is returned is *not* normalized by
1627   /// the factor width.
1628   VectorizationCostTy expectedCost(ElementCount VF);
1629 
1630   /// Returns the execution time cost of an instruction for a given vector
1631   /// width. Vector width of one means scalar.
1632   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1633 
1634   /// The cost-computation logic from getInstructionCost which provides
1635   /// the vector type as an output parameter.
1636   unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy);
1637 
1638   /// Calculate vectorization cost of memory instruction \p I.
1639   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
1640 
1641   /// The cost computation for scalarized memory instruction.
1642   unsigned getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1643 
1644   /// The cost computation for interleaving group of memory instructions.
1645   unsigned getInterleaveGroupCost(Instruction *I, ElementCount VF);
1646 
1647   /// The cost computation for Gather/Scatter instruction.
1648   unsigned getGatherScatterCost(Instruction *I, ElementCount VF);
1649 
1650   /// The cost computation for widening instruction \p I with consecutive
1651   /// memory access.
1652   unsigned getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1653 
  /// The cost calculation for Load/Store instruction \p I with a uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored ? 0 : extract of last
  /// element).
1658   unsigned getUniformMemOpCost(Instruction *I, ElementCount VF);
1659 
1660   /// Estimate the overhead of scalarizing an instruction. This is a
1661   /// convenience wrapper for the type-based getScalarizationOverhead API.
1662   unsigned getScalarizationOverhead(Instruction *I, ElementCount VF);
1663 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1666   bool isConsecutiveLoadOrStore(Instruction *I);
1667 
1668   /// Returns true if an artificially high cost for emulated masked memrefs
1669   /// should be used.
1670   bool useEmulatedMaskMemRefHack(Instruction *I);
1671 
1672   /// Map of scalar integer values to the smallest bitwidth they can be legally
1673   /// represented as. The vector equivalents of these values should be truncated
1674   /// to this type.
1675   MapVector<Instruction *, uint64_t> MinBWs;
1676 
1677   /// A type representing the costs for instructions if they were to be
1678   /// scalarized rather than vectorized. The entries are Instruction-Cost
1679   /// pairs.
1680   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1681 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1684   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1685 
1686   /// Records whether it is allowed to have the original scalar loop execute at
1687   /// least once. This may be needed as a fallback loop in case runtime
1688   /// aliasing/dependence checks fail, or to handle the tail/remainder
1689   /// iterations when the trip count is unknown or doesn't divide by the VF,
1690   /// or as a peel-loop to handle gaps in interleave-groups.
1691   /// Under optsize and when the trip count is very small we don't allow any
1692   /// iterations to execute in the scalar loop.
1693   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1694 
1695   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1696   bool FoldTailByMasking = false;
1697 
1698   /// A map holding scalar costs for different vectorization factors. The
1699   /// presence of a cost for an instruction in the mapping indicates that the
1700   /// instruction will be scalarized when vectorizing with the associated
1701   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1702   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1703 
1704   /// Holds the instructions known to be uniform after vectorization.
1705   /// The data is collected per VF.
1706   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1707 
1708   /// Holds the instructions known to be scalar after vectorization.
1709   /// The data is collected per VF.
1710   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1711 
1712   /// Holds the instructions (address computations) that are forced to be
1713   /// scalarized.
1714   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1715 
  /// PHINodes of the reductions that should be expanded in-loop, along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
1719   ReductionChainMap InLoopReductionChains;
1720 
1721   /// Returns the expected difference in cost from scalarizing the expression
1722   /// feeding a predicated instruction \p PredInst. The instructions to
1723   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1724   /// non-negative return value implies the expression will be scalarized.
1725   /// Currently, only single-use chains are considered for scalarization.
1726   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1727                               ElementCount VF);
1728 
1729   /// Collect the instructions that are uniform after vectorization. An
1730   /// instruction is uniform if we represent it with a single scalar value in
1731   /// the vectorized loop corresponding to each vector iteration. Examples of
1732   /// uniform instructions include pointer operands of consecutive or
1733   /// interleaved memory accesses. Note that although uniformity implies an
1734   /// instruction will be scalar, the reverse is not true. In general, a
1735   /// scalarized instruction will be represented by VF scalar values in the
1736   /// vectorized loop, each corresponding to an iteration of the original
1737   /// scalar loop.
1738   void collectLoopUniforms(ElementCount VF);
1739 
1740   /// Collect the instructions that are scalar after vectorization. An
1741   /// instruction is scalar if it is known to be uniform or will be scalarized
1742   /// during vectorization. Non-uniform scalarized instructions will be
1743   /// represented by VF values in the vectorized loop, each corresponding to an
1744   /// iteration of the original scalar loop.
1745   void collectLoopScalars(ElementCount VF);
1746 
1747   /// Keeps cost model vectorization decision and cost for instructions.
1748   /// Right now it is used for memory instructions only.
1749   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1750                                 std::pair<InstWidening, unsigned>>;
1751 
1752   DecisionList WideningDecisions;
1753 
1754   /// Returns true if \p V is expected to be vectorized and it needs to be
1755   /// extracted.
1756   bool needsExtract(Value *V, ElementCount VF) const {
1757     Instruction *I = dyn_cast<Instruction>(V);
1758     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1759         TheLoop->isLoopInvariant(I))
1760       return false;
1761 
1762     // Assume we can vectorize V (and hence we need extraction) if the
1763     // scalars are not computed yet. This can happen, because it is called
1764     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1765     // the scalars are collected. That should be a safe assumption in most
1766     // cases, because we check if the operands have vectorizable types
1767     // beforehand in LoopVectorizationLegality.
1768     return Scalars.find(VF) == Scalars.end() ||
1769            !isScalarAfterVectorization(I, VF);
1770   };
1771 
1772   /// Returns a range containing only operands needing to be extracted.
1773   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1774                                                    ElementCount VF) {
1775     return SmallVector<Value *, 4>(make_filter_range(
1776         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1777   }
1778 
1779   /// Determines if we have the infrastructure to vectorize loop \p L and its
1780   /// epilogue, assuming the main loop is vectorized by \p VF.
1781   bool isCandidateForEpilogueVectorization(const Loop &L,
1782                                            const ElementCount VF) const;
1783 
1784   /// Returns true if epilogue vectorization is considered profitable, and
1785   /// false otherwise.
1786   /// \p VF is the vectorization factor chosen for the original loop.
1787   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1788 
1789 public:
1790   /// The loop that we evaluate.
1791   Loop *TheLoop;
1792 
1793   /// Predicated scalar evolution analysis.
1794   PredicatedScalarEvolution &PSE;
1795 
1796   /// Loop Info analysis.
1797   LoopInfo *LI;
1798 
1799   /// Vectorization legality.
1800   LoopVectorizationLegality *Legal;
1801 
1802   /// Vector target information.
1803   const TargetTransformInfo &TTI;
1804 
1805   /// Target Library Info.
1806   const TargetLibraryInfo *TLI;
1807 
1808   /// Demanded bits analysis.
1809   DemandedBits *DB;
1810 
1811   /// Assumption cache.
1812   AssumptionCache *AC;
1813 
1814   /// Interface to emit optimization remarks.
1815   OptimizationRemarkEmitter *ORE;
1816 
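  /// The function containing the loop.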
1817   const Function *TheFunction;
1818 
1819   /// Loop Vectorize Hint.
1820   const LoopVectorizeHints *Hints;
1821 
1822   /// The interleave access information contains groups of interleaved accesses
1823   /// with the same stride and close to each other.
1824   InterleavedAccessInfo &InterleaveInfo;
1825 
1826   /// Values to ignore in the cost model.
1827   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1828 
1829   /// Values to ignore in the cost model when VF > 1.
1830   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1831 
1832   /// Profitable vector factors.
1833   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1834 };
1835 
1836 } // end namespace llvm
1837 
1838 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1839 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
1845 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1846 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1847 // provides *explicit vectorization hints* (LV can bypass legal checks and
1848 // assume that vectorization is legal). However, both hints are implemented
1849 // using the same metadata (llvm.loop.vectorize, processed by
1850 // LoopVectorizeHints). This will be fixed in the future when the native IR
1851 // representation for pragma 'omp simd' is introduced.
1852 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1853                                    OptimizationRemarkEmitter *ORE) {
1854   assert(!OuterLp->isInnermost() && "This is not an outer loop");
1855   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1856 
1857   // Only outer loops with an explicit vectorization hint are supported.
1858   // Unannotated outer loops are ignored.
1859   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1860     return false;
1861 
1862   Function *Fn = OuterLp->getHeader()->getParent();
1863   if (!Hints.allowVectorization(Fn, OuterLp,
1864                                 true /*VectorizeOnlyWhenForced*/)) {
1865     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1866     return false;
1867   }
1868 
1869   if (Hints.getInterleave() > 1) {
1870     // TODO: Interleave support is future work.
1871     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1872                          "outer loops.\n");
1873     Hints.emitRemarkWithHints();
1874     return false;
1875   }
1876 
1877   return true;
1878 }
1879 
1880 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1881                                   OptimizationRemarkEmitter *ORE,
1882                                   SmallVectorImpl<Loop *> &V) {
1883   // Collect inner loops and outer loops without irreducible control flow. For
1884   // now, only collect outer loops that have explicit vectorization hints. If we
1885   // are stress testing the VPlan H-CFG construction, we collect the outermost
1886   // loop of every loop nest.
1887   if (L.isInnermost() || VPlanBuildStressTest ||
1888       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1889     LoopBlocksRPO RPOT(&L);
1890     RPOT.perform(LI);
1891     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1892       V.push_back(&L);
1893       // TODO: Collect inner loops inside marked outer loops in case
1894       // vectorization fails for the outer loop. Do not invoke
1895       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1896       // already known to be reducible. We can use an inherited attribute for
1897       // that.
1898       return;
1899     }
1900   }
1901   for (Loop *InnerL : L)
1902     collectSupportedLoops(*InnerL, LI, ORE, V);
1903 }
1904 
1905 namespace {
1906 
1907 /// The LoopVectorize Pass.
1908 struct LoopVectorize : public FunctionPass {
1909   /// Pass identification, replacement for typeid
1910   static char ID;
1911 
1912   LoopVectorizePass Impl;
1913 
1914   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1915                          bool VectorizeOnlyWhenForced = false)
1916       : FunctionPass(ID),
1917         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
1918     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1919   }
1920 
1921   bool runOnFunction(Function &F) override {
1922     if (skipFunction(F))
1923       return false;
1924 
1925     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1926     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1927     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1928     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1929     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1930     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1931     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1932     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1933     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1934     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1935     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1936     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1937     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1938 
1939     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1940         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1941 
1942     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1943                         GetLAA, *ORE, PSI).MadeAnyChange;
1944   }
1945 
1946   void getAnalysisUsage(AnalysisUsage &AU) const override {
1947     AU.addRequired<AssumptionCacheTracker>();
1948     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1949     AU.addRequired<DominatorTreeWrapperPass>();
1950     AU.addRequired<LoopInfoWrapperPass>();
1951     AU.addRequired<ScalarEvolutionWrapperPass>();
1952     AU.addRequired<TargetTransformInfoWrapperPass>();
1953     AU.addRequired<AAResultsWrapperPass>();
1954     AU.addRequired<LoopAccessLegacyAnalysis>();
1955     AU.addRequired<DemandedBitsWrapperPass>();
1956     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1957     AU.addRequired<InjectTLIMappingsLegacy>();
1958 
1959     // We currently do not preserve loopinfo/dominator analyses with outer loop
1960     // vectorization. Until this is addressed, mark these analyses as preserved
1961     // only for non-VPlan-native path.
1962     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1963     if (!EnableVPlanNativePath) {
1964       AU.addPreserved<LoopInfoWrapperPass>();
1965       AU.addPreserved<DominatorTreeWrapperPass>();
1966     }
1967 
1968     AU.addPreserved<BasicAAWrapperPass>();
1969     AU.addPreserved<GlobalsAAWrapperPass>();
1970     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1971   }
1972 };
1973 
1974 } // end anonymous namespace
1975 
1976 //===----------------------------------------------------------------------===//
1977 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1978 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1979 //===----------------------------------------------------------------------===//
1980 
1981 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1982   // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // inside the vector loop body.
1985   Instruction *Instr = dyn_cast<Instruction>(V);
1986   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1987                      (!Instr ||
1988                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1989   // Place the code for broadcasting invariant variables in the new preheader.
1990   IRBuilder<>::InsertPointGuard Guard(Builder);
1991   if (SafeToHoist)
1992     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1993 
1994   // Broadcast the scalar into all locations in the vector.
1995   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1996 
1997   return Shuf;
1998 }
1999 
2000 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2001     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
2002   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2003          "Expected either an induction phi-node or a truncate of it!");
2004   Value *Start = II.getStartValue();
2005 
2006   // Construct the initial value of the vector IV in the vector loop preheader
2007   auto CurrIP = Builder.saveIP();
2008   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2009   if (isa<TruncInst>(EntryVal)) {
2010     assert(Start->getType()->isIntegerTy() &&
2011            "Truncation requires an integer type");
2012     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2013     Step = Builder.CreateTrunc(Step, TruncType);
2014     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2015   }
2016   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2017   Value *SteppedStart =
2018       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2019 
2020   // We create vector phi nodes for both integer and floating-point induction
2021   // variables. Here, we determine the kind of arithmetic we will perform.
2022   Instruction::BinaryOps AddOp;
2023   Instruction::BinaryOps MulOp;
2024   if (Step->getType()->isIntegerTy()) {
2025     AddOp = Instruction::Add;
2026     MulOp = Instruction::Mul;
2027   } else {
2028     AddOp = II.getInductionOpcode();
2029     MulOp = Instruction::FMul;
2030   }
2031 
2032   // Multiply the vectorization factor by the step using integer or
2033   // floating-point arithmetic as appropriate.
2034   Value *ConstVF =
2035       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
2036   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
2037 
2038   // Create a vector splat to use in the induction update.
2039   //
2040   // FIXME: If the step is non-constant, we create the vector splat with
2041   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2042   //        handle a constant vector splat.
2043   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2044   Value *SplatVF = isa<Constant>(Mul)
2045                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2046                        : Builder.CreateVectorSplat(VF, Mul);
2047   Builder.restoreIP(CurrIP);
2048 
2049   // We may need to add the step a number of times, depending on the unroll
2050   // factor. The last of those goes into the PHI.
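  // For example, with UF = 2 the vector IV is advanced by SplatVF twice per
  // vector iteration, and the value after the second addition feeds the PHI.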
2051   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2052                                     &*LoopVectorBody->getFirstInsertionPt());
2053   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2054   Instruction *LastInduction = VecInd;
2055   for (unsigned Part = 0; Part < UF; ++Part) {
2056     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
2057 
2058     if (isa<TruncInst>(EntryVal))
2059       addMetadata(LastInduction, EntryVal);
2060     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
2061 
2062     LastInduction = cast<Instruction>(addFastMathFlag(
2063         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
2064     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2065   }
2066 
2067   // Move the last step to the end of the latch block. This ensures consistent
2068   // placement of all induction updates.
2069   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2070   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2071   auto *ICmp = cast<Instruction>(Br->getCondition());
2072   LastInduction->moveBefore(ICmp);
2073   LastInduction->setName("vec.ind.next");
2074 
2075   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2076   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2077 }
2078 
2079 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2080   return Cost->isScalarAfterVectorization(I, VF) ||
2081          Cost->isProfitableToScalarize(I, VF);
2082 }
2083 
2084 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2085   if (shouldScalarizeInstruction(IV))
2086     return true;
2087   auto isScalarInst = [&](User *U) -> bool {
2088     auto *I = cast<Instruction>(U);
2089     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2090   };
2091   return llvm::any_of(IV->users(), isScalarInst);
2092 }
2093 
2094 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2095     const InductionDescriptor &ID, const Instruction *EntryVal,
2096     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
2097   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2098          "Expected either an induction phi-node or a truncate of it!");
2099 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // reuses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
2106   if (isa<TruncInst>(EntryVal))
2107     return;
2108 
2109   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2110   if (Casts.empty())
2111     return;
2112   // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
2114   // induction update chain itself.
2115   Instruction *CastInst = *Casts.begin();
2116   if (Lane < UINT_MAX)
2117     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
2118   else
2119     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
2120 }
2121 
2122 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
2123   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2124          "Primary induction variable must have an integer type");
2125 
2126   auto II = Legal->getInductionVars().find(IV);
2127   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2128 
2129   auto ID = II->second;
2130   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2131 
2132   // The value from the original loop to which we are mapping the new induction
2133   // variable.
2134   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2135 
2136   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2137 
2138   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
2140   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2141     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2142            "Induction step should be loop invariant");
2143     if (PSE.getSE()->isSCEVable(IV->getType())) {
2144       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2145       return Exp.expandCodeFor(Step, Step->getType(),
2146                                LoopVectorPreHeader->getTerminator());
2147     }
2148     return cast<SCEVUnknown>(Step)->getValue();
2149   };
2150 
2151   // The scalar value to broadcast. This is derived from the canonical
2152   // induction variable. If a truncation type is given, truncate the canonical
2153   // induction variable and step. Otherwise, derive these values from the
2154   // induction descriptor.
2155   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2156     Value *ScalarIV = Induction;
2157     if (IV != OldInduction) {
2158       ScalarIV = IV->getType()->isIntegerTy()
2159                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2160                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2161                                           IV->getType());
2162       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2163       ScalarIV->setName("offset.idx");
2164     }
2165     if (Trunc) {
2166       auto *TruncType = cast<IntegerType>(Trunc->getType());
2167       assert(Step->getType()->isIntegerTy() &&
2168              "Truncation requires an integer step");
2169       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2170       Step = Builder.CreateTrunc(Step, TruncType);
2171     }
2172     return ScalarIV;
2173   };
2174 
2175   // Create the vector values from the scalar IV, in the absence of creating a
2176   // vector IV.
2177   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2178     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2179     for (unsigned Part = 0; Part < UF; ++Part) {
2180       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2181       Value *EntryPart =
2182           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2183                         ID.getInductionOpcode());
2184       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
2185       if (Trunc)
2186         addMetadata(EntryPart, Trunc);
2187       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
2188     }
2189   };
2190 
2191   // Now do the actual transformations, and start with creating the step value.
2192   Value *Step = CreateStepValue(ID.getStep());
2193   if (VF.isZero() || VF.isScalar()) {
2194     Value *ScalarIV = CreateScalarIV(Step);
2195     CreateSplatIV(ScalarIV, Step);
2196     return;
2197   }
2198 
2199   // Determine if we want a scalar version of the induction variable. This is
2200   // true if the induction variable itself is not widened, or if it has at
2201   // least one user in the loop that is not widened.
2202   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2203   if (!NeedsScalarIV) {
2204     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2205     return;
2206   }
2207 
2208   // Try to create a new independent vector induction variable. If we can't
2209   // create the phi node, we will splat the scalar induction variable in each
2210   // loop iteration.
2211   if (!shouldScalarizeInstruction(EntryVal)) {
2212     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
2213     Value *ScalarIV = CreateScalarIV(Step);
2214     // Create scalar steps that can be used by instructions we will later
2215     // scalarize. Note that the addition of the scalar steps will not increase
2216     // the number of instructions in the loop in the common case prior to
2217     // InstCombine. We will be trading one vector extract for each scalar step.
2218     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2219     return;
2220   }
2221 
  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV, except when we tail-fold: then the splat IV feeds the
  // predicate used by the masked loads/stores.
2225   Value *ScalarIV = CreateScalarIV(Step);
2226   if (!Cost->isScalarEpilogueAllowed())
2227     CreateSplatIV(ScalarIV, Step);
2228   buildScalarSteps(ScalarIV, Step, EntryVal, ID);
2229 }
2230 
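// Return a vector containing Val combined with the per-lane offsets
// <StartIdx, StartIdx + 1, ...> * Step, using integer add or the given FP
// binary operator for floating-point inductions.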
2231 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2232                                           Instruction::BinaryOps BinOp) {
2233   // Create and check the types.
2234   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2235   int VLen = ValVTy->getNumElements();
2236 
2237   Type *STy = Val->getType()->getScalarType();
2238   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2239          "Induction Step must be an integer or FP");
2240   assert(Step->getType() == STy && "Step has wrong type");
2241 
2242   SmallVector<Constant *, 8> Indices;
2243 
2244   if (STy->isIntegerTy()) {
2245     // Create a vector of consecutive numbers from zero to VF.
2246     for (int i = 0; i < VLen; ++i)
2247       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2248 
2249     // Add the consecutive indices to the vector value.
2250     Constant *Cv = ConstantVector::get(Indices);
2251     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2252     Step = Builder.CreateVectorSplat(VLen, Step);
2253     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
2256     Step = Builder.CreateMul(Cv, Step);
2257     return Builder.CreateAdd(Val, Step, "induction");
2258   }
2259 
2260   // Floating point induction.
2261   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2262          "Binary Opcode should be specified for FP induction");
2263   // Create a vector of consecutive numbers from zero to VF.
2264   for (int i = 0; i < VLen; ++i)
2265     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2266 
2267   // Add the consecutive indices to the vector value.
2268   Constant *Cv = ConstantVector::get(Indices);
2269 
2270   Step = Builder.CreateVectorSplat(VLen, Step);
2271 
2272   // Floating point operations had to be 'fast' to enable the induction.
2273   FastMathFlags Flags;
2274   Flags.setFast();
2275 
2276   Value *MulOp = Builder.CreateFMul(Cv, Step);
2277   if (isa<Instruction>(MulOp))
    // Have to check; MulOp may be a constant.
2279     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
2280 
2281   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2282   if (isa<Instruction>(BOp))
2283     cast<Instruction>(BOp)->setFastMathFlags(Flags);
2284   return BOp;
2285 }
2286 
2287 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2288                                            Instruction *EntryVal,
2289                                            const InductionDescriptor &ID) {
2290   // We shouldn't have to build scalar steps if we aren't vectorizing.
2291   assert(VF.isVector() && "VF should be greater than one");
2292   // Get the value type and ensure it and the step have the same integer type.
2293   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2294   assert(ScalarIVTy == Step->getType() &&
2295          "Val and Step should have the same type");
2296 
2297   // We build scalar steps for both integer and floating-point induction
2298   // variables. Here, we determine the kind of arithmetic we will perform.
2299   Instruction::BinaryOps AddOp;
2300   Instruction::BinaryOps MulOp;
2301   if (ScalarIVTy->isIntegerTy()) {
2302     AddOp = Instruction::Add;
2303     MulOp = Instruction::Mul;
2304   } else {
2305     AddOp = ID.getInductionOpcode();
2306     MulOp = Instruction::FMul;
2307   }
2308 
2309   // Determine the number of scalars we need to generate for each unroll
2310   // iteration. If EntryVal is uniform, we only need to generate the first
2311   // lane. Otherwise, we generate all VF values.
2312   unsigned Lanes =
2313       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2314           ? 1
2315           : VF.getKnownMinValue();
2316   assert((!VF.isScalable() || Lanes == 1) &&
2317          "Should never scalarize a scalable vector");
2318   // Compute the scalar steps and save the results in VectorLoopValueMap.
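  // Each generated value is ScalarIV + (Part * VF + Lane) * Step, built with
  // the AddOp/MulOp operations selected above.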
2319   for (unsigned Part = 0; Part < UF; ++Part) {
2320     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2321       auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2322                                          ScalarIVTy->getScalarSizeInBits());
2323       Value *StartIdx =
2324           createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2325       if (ScalarIVTy->isFloatingPointTy())
2326         StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
2327       StartIdx = addFastMathFlag(Builder.CreateBinOp(
2328           AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
2329       // The step returned by `createStepForVF` is a runtime-evaluated value
2330       // when VF is scalable. Otherwise, it should be folded into a Constant.
2331       assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2332              "Expected StartIdx to be folded to a constant when VF is not "
2333              "scalable");
2334       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
2335       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
2336       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
2337       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
2338     }
2339   }
2340 }
2341 
2342 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2343   assert(V != Induction && "The new induction variable should not be used.");
2344   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2345   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2346 
2347   // If we have a stride that is replaced by one, do it here. Defer this for
2348   // the VPlan-native path until we start running Legal checks in that path.
2349   if (!EnableVPlanNativePath && Legal->hasStride(V))
2350     V = ConstantInt::get(V->getType(), 1);
2351 
2352   // If we have a vector mapped to this value, return it.
2353   if (VectorLoopValueMap.hasVectorValue(V, Part))
2354     return VectorLoopValueMap.getVectorValue(V, Part);
2355 
2356   // If the value has not been vectorized, check if it has been scalarized
2357   // instead. If it has been scalarized, and we actually need the value in
2358   // vector form, we will construct the vector values on demand.
2359   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2360     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2361 
2362     // If we've scalarized a value, that value should be an instruction.
2363     auto *I = cast<Instruction>(V);
2364 
2365     // If we aren't vectorizing, we can just copy the scalar map values over to
2366     // the vector map.
2367     if (VF.isScalar()) {
2368       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2369       return ScalarValue;
2370     }
2371 
2372     // Get the last scalar instruction we generated for V and Part. If the value
2373     // is known to be uniform after vectorization, this corresponds to lane zero
2374     // of the Part unroll iteration. Otherwise, the last instruction is the one
2375     // we created for the last vector lane of the Part unroll iteration.
2376     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
2377                             ? 0
2378                             : VF.getKnownMinValue() - 1;
2379     assert((!VF.isScalable() || LastLane == 0) &&
2380            "Scalable vectorization can't lead to any scalarized values.");
2381     auto *LastInst = cast<Instruction>(
2382         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2383 
2384     // Set the insert point after the last scalarized instruction. This ensures
2385     // the insertelement sequence will directly follow the scalar definitions.
2386     auto OldIP = Builder.saveIP();
2387     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2388     Builder.SetInsertPoint(&*NewIP);
2389 
2390     // However, if we are vectorizing, we need to construct the vector values.
2391     // If the value is known to be uniform after vectorization, we can just
2392     // broadcast the scalar value corresponding to lane zero for each unroll
2393     // iteration. Otherwise, we construct the vector values using insertelement
2394     // instructions. Since the resulting vectors are stored in
2395     // VectorLoopValueMap, we will only generate the insertelements once.
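    // For example (assuming a fixed VF of 4), the non-uniform case below packs
    // lanes 0..3 of this unroll part with four insertelement instructions,
    // starting from an undef vector.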
2396     Value *VectorValue = nullptr;
2397     if (Cost->isUniformAfterVectorization(I, VF)) {
2398       VectorValue = getBroadcastInstrs(ScalarValue);
2399       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2400     } else {
2401       // Initialize packing with insertelements to start from undef.
2402       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2403       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2404       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2405       for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
2406         packScalarIntoVectorValue(V, {Part, Lane});
2407       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2408     }
2409     Builder.restoreIP(OldIP);
2410     return VectorValue;
2411   }
2412 
2413   // If this scalar is unknown, assume that it is a constant or that it is
2414   // loop invariant. Broadcast V and save the value for future uses.
2415   Value *B = getBroadcastInstrs(V);
2416   VectorLoopValueMap.setVectorValue(V, Part, B);
2417   return B;
2418 }
2419 
2420 Value *
2421 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2422                                             const VPIteration &Instance) {
2423   // If the value is not an instruction contained in the loop, it should
2424   // already be scalar.
2425   if (OrigLoop->isLoopInvariant(V))
2426     return V;
2427 
  assert((Instance.Lane == 0 ||
          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
         "Uniform values only have lane zero");
2431 
2432   // If the value from the original loop has not been vectorized, it is
2433   // represented by UF x VF scalar values in the new loop. Return the requested
2434   // scalar value.
2435   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2436     return VectorLoopValueMap.getScalarValue(V, Instance);
2437 
2438   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2439   // for the given unroll part. If this entry is not a vector type (i.e., the
2440   // vectorization factor is one), there is no need to generate an
2441   // extractelement instruction.
2442   auto *U = getOrCreateVectorValue(V, Instance.Part);
2443   if (!U->getType()->isVectorTy()) {
2444     assert(VF.isScalar() && "Value not scalarized has non-vector type");
2445     return U;
2446   }
2447 
2448   // Otherwise, the value from the original loop has been vectorized and is
2449   // represented by UF vector values. Extract and return the requested scalar
2450   // value from the appropriate vector lane.
2451   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2452 }
2453 
2454 void InnerLoopVectorizer::packScalarIntoVectorValue(
2455     Value *V, const VPIteration &Instance) {
2456   assert(V != Induction && "The new induction variable should not be used.");
2457   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2458   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2459 
2460   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2461   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2462   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2463                                             Builder.getInt32(Instance.Lane));
2464   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2465 }
2466 
2467 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2468   assert(Vec->getType()->isVectorTy() && "Invalid type");
2469   assert(!VF.isScalable() && "Cannot reverse scalable vectors");
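  // Build a shuffle mask that reverses the lane order; e.g. for a fixed VF of
  // 4 the resulting mask is <3, 2, 1, 0>.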
2470   SmallVector<int, 8> ShuffleMask;
2471   for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
2472     ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
2473 
2474   return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
2475 }
2476 
2477 // Return whether we allow using masked interleave-groups (for dealing with
2478 // strided loads/stores that reside in predicated blocks, or for dealing
2479 // with gaps).
2480 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2481   // If an override option has been passed in for interleaved accesses, use it.
2482   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2483     return EnableMaskedInterleavedMemAccesses;
2484 
2485   return TTI.enableMaskedInterleavedAccessVectorization();
2486 }
2487 
2488 // Try to vectorize the interleave group that \p Instr belongs to.
2489 //
// E.g. Translate the following interleaved load group (factor = 3):
2491 //   for (i = 0; i < N; i+=3) {
2492 //     R = Pic[i];             // Member of index 0
2493 //     G = Pic[i+1];           // Member of index 1
2494 //     B = Pic[i+2];           // Member of index 2
2495 //     ... // do something to R, G, B
2496 //   }
2497 // To:
2498 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2499 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2500 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2501 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2502 //
// Or translate the following interleaved store group (factor = 3):
2504 //   for (i = 0; i < N; i+=3) {
2505 //     ... do something to R, G, B
2506 //     Pic[i]   = R;           // Member of index 0
2507 //     Pic[i+1] = G;           // Member of index 1
2508 //     Pic[i+2] = B;           // Member of index 2
2509 //   }
2510 // To:
2511 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2512 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2513 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2514 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2515 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2516 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2517     const InterleaveGroup<Instruction> *Group, VPTransformState &State,
2518     VPValue *Addr, ArrayRef<VPValue *> StoredValues, VPValue *BlockInMask) {
2519   Instruction *Instr = Group->getInsertPos();
2520   const DataLayout &DL = Instr->getModule()->getDataLayout();
2521 
2522   // Prepare for the vector type of the interleaved load/store.
2523   Type *ScalarTy = getMemInstValueType(Instr);
2524   unsigned InterleaveFactor = Group->getFactor();
2525   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2526   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2527 
2528   // Prepare for the new pointers.
2529   SmallVector<Value *, 2> AddrParts;
2530   unsigned Index = Group->getIndex(Instr);
2531 
2532   // TODO: extend the masked interleaved-group support to reversed access.
2533   assert((!BlockInMask || !Group->isReverse()) &&
2534          "Reversed masked interleave-group not supported.");
2535 
  // If the group is reversed, adjust the index to refer to the last vector lane
2537   // instead of the first. We adjust the index from the first vector lane,
2538   // rather than directly getting the pointer for lane VF - 1, because the
2539   // pointer operand of the interleaved access is supposed to be uniform. For
2540   // uniform instructions, we're only required to generate a value for the
2541   // first vector lane in each unroll iteration.
2542   assert(!VF.isScalable() &&
2543          "scalable vector reverse operation is not implemented");
2544   if (Group->isReverse())
2545     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2546 
2547   for (unsigned Part = 0; Part < UF; Part++) {
2548     Value *AddrPart = State.get(Addr, {Part, 0});
2549     setDebugLocFromInst(Builder, AddrPart);
2550 
    // Note that the current instruction could be at any index in the group.
    // We need to adjust the address so it points to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2562 
2563     bool InBounds = false;
2564     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2565       InBounds = gep->isInBounds();
2566     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2567     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2568 
2569     // Cast to the vector pointer type.
2570     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2571     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2572     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2573   }
2574 
2575   setDebugLocFromInst(Builder, Instr);
2576   Value *UndefVec = UndefValue::get(VecTy);
2577 
2578   Value *MaskForGaps = nullptr;
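  // If the group has gaps and no scalar epilogue can be used, MaskForGaps
  // disables the lanes that would access a missing member. For example
  // (assuming a fixed VF of 4 and a factor-2 group whose second member is
  // absent), the mask is <1, 0, 1, 0, 1, 0, 1, 0>.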
2579   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2580     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2581     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2582     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2583   }
2584 
2585   // Vectorize the interleaved load group.
2586   if (isa<LoadInst>(Instr)) {
2587     // For each unroll part, create a wide load for the group.
2588     SmallVector<Value *, 2> NewLoads;
2589     for (unsigned Part = 0; Part < UF; Part++) {
2590       Instruction *NewLoad;
2591       if (BlockInMask || MaskForGaps) {
2592         assert(useMaskedInterleavedAccesses(*TTI) &&
2593                "masked interleaved groups are not allowed.");
2594         Value *GroupMask = MaskForGaps;
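        // The block mask has one bit per original loop iteration; replicate
        // each bit InterleaveFactor times so it covers the wide vector. For
        // example (assuming factor 3 and a fixed VF of 4), the replication
        // mask is <0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3>.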
2595         if (BlockInMask) {
2596           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2597           assert(!VF.isScalable() && "scalable vectors not yet supported.");
2598           Value *ShuffledMask = Builder.CreateShuffleVector(
2599               BlockInMaskPart,
2600               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2601               "interleaved.mask");
2602           GroupMask = MaskForGaps
2603                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2604                                                 MaskForGaps)
2605                           : ShuffledMask;
2606         }
2607         NewLoad =
2608             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2609                                      GroupMask, UndefVec, "wide.masked.vec");
      } else
2612         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2613                                             Group->getAlign(), "wide.vec");
2614       Group->addMetadata(NewLoad);
2615       NewLoads.push_back(NewLoad);
2616     }
2617 
2618     // For each member in the group, shuffle out the appropriate data from the
2619     // wide loads.
2620     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2621       Instruction *Member = Group->getMember(I);
2622 
2623       // Skip the gaps in the group.
2624       if (!Member)
2625         continue;
2626 
2627       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2628       auto StrideMask =
2629           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2630       for (unsigned Part = 0; Part < UF; Part++) {
2631         Value *StridedVec = Builder.CreateShuffleVector(
2632             NewLoads[Part], StrideMask, "strided.vec");
2633 
        // If this member has a different type, cast the result to the member's
        // type.
2635         if (Member->getType() != ScalarTy) {
2636           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2637           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2638           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2639         }
2640 
2641         if (Group->isReverse())
2642           StridedVec = reverseVector(StridedVec);
2643 
2644         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2645       }
2646     }
2647     return;
2648   }
2649 
  // The sub vector type for the current instruction.
2651   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2652   auto *SubVT = VectorType::get(ScalarTy, VF);
2653 
2654   // Vectorize the interleaved store group.
2655   for (unsigned Part = 0; Part < UF; Part++) {
2656     // Collect the stored vector from each member.
2657     SmallVector<Value *, 4> StoredVecs;
2658     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // An interleaved store group doesn't allow gaps, so each index must have
      // a member.
      assert(Group->getMember(i) &&
             "Fail to get a member from an interleaved store group");
2661 
2662       Value *StoredVec = State.get(StoredValues[i], Part);
2663 
2664       if (Group->isReverse())
2665         StoredVec = reverseVector(StoredVec);
2666 
      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
2670         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2671 
2672       StoredVecs.push_back(StoredVec);
2673     }
2674 
2675     // Concatenate all vectors into a wide vector.
2676     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2677 
2678     // Interleave the elements in the wide vector.
2679     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2680     Value *IVec = Builder.CreateShuffleVector(
2681         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2682         "interleaved.vec");
2683 
2684     Instruction *NewStoreInstr;
2685     if (BlockInMask) {
2686       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2687       Value *ShuffledMask = Builder.CreateShuffleVector(
2688           BlockInMaskPart,
2689           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2690           "interleaved.mask");
2691       NewStoreInstr = Builder.CreateMaskedStore(
2692           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    } else
2695       NewStoreInstr =
2696           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2697 
2698     Group->addMetadata(NewStoreInstr);
2699   }
2700 }
2701 
2702 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2703     Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2704     VPValue *StoredValue, VPValue *BlockInMask) {
2705   // Attempt to issue a wide load.
2706   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2707   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2708 
2709   assert((LI || SI) && "Invalid Load/Store instruction");
2710   assert((!SI || StoredValue) && "No stored value provided for widened store");
2711   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2712 
2713   LoopVectorizationCostModel::InstWidening Decision =
2714       Cost->getWideningDecision(Instr, VF);
2715   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2716           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2717           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2718          "CM decision is not to widen the memory instruction");
2719 
2720   Type *ScalarDataTy = getMemInstValueType(Instr);
2721 
2722   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2723   const Align Alignment = getLoadStoreAlignment(Instr);
2724 
2725   // Determine if the pointer operand of the access is either consecutive or
2726   // reverse consecutive.
2727   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2728   bool ConsecutiveStride =
2729       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2730   bool CreateGatherScatter =
2731       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2732 
2733   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2734   // gather/scatter. Otherwise Decision should have been to Scalarize.
2735   assert((ConsecutiveStride || CreateGatherScatter) &&
2736          "The instruction should be scalarized");
2737   (void)ConsecutiveStride;
2738 
2739   VectorParts BlockInMaskParts(UF);
2740   bool isMaskRequired = BlockInMask;
2741   if (isMaskRequired)
2742     for (unsigned Part = 0; Part < UF; ++Part)
2743       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2744 
2745   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2746     // Calculate the pointer for the specific unroll-part.
2747     GetElementPtrInst *PartPtr = nullptr;
2748 
2749     bool InBounds = false;
2750     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2751       InBounds = gep->isInBounds();
2752 
2753     if (Reverse) {
2754       assert(!VF.isScalable() &&
2755              "Reversing vectors is not yet supported for scalable vectors.");
2756 
2757       // If the address is consecutive but reversed, then the
2758       // wide store needs to start at the last vector element.
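      // For example, with a fixed VF of 4 and Part = 1, the two GEPs below
      // compute Ptr - 4 and then Ptr - 7, so the wide access covers elements
      // Ptr[-7] .. Ptr[-4]; the loaded or stored value is reversed separately.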
2759       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2760           ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
2761       PartPtr->setIsInBounds(InBounds);
2762       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
2763           ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
2764       PartPtr->setIsInBounds(InBounds);
2765       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2766         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2767     } else {
2768       Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2769       PartPtr = cast<GetElementPtrInst>(
2770           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2771       PartPtr->setIsInBounds(InBounds);
2772     }
2773 
2774     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2775     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2776   };
2777 
2778   // Handle Stores:
2779   if (SI) {
2780     setDebugLocFromInst(Builder, SI);
2781 
2782     for (unsigned Part = 0; Part < UF; ++Part) {
2783       Instruction *NewSI = nullptr;
2784       Value *StoredVal = State.get(StoredValue, Part);
2785       if (CreateGatherScatter) {
2786         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2787         Value *VectorGep = State.get(Addr, Part);
2788         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2789                                             MaskPart);
2790       } else {
2791         if (Reverse) {
2792           // If we store to reverse consecutive memory locations, then we need
2793           // to reverse the order of elements in the stored value.
2794           StoredVal = reverseVector(StoredVal);
2795           // We don't want to update the value in the map as it might be used in
2796           // another expression. So don't call resetVectorValue(StoredVal).
2797         }
2798         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2799         if (isMaskRequired)
2800           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2801                                             BlockInMaskParts[Part]);
2802         else
2803           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2804       }
2805       addMetadata(NewSI, SI);
2806     }
2807     return;
2808   }
2809 
2810   // Handle loads.
2811   assert(LI && "Must have a load instruction");
2812   setDebugLocFromInst(Builder, LI);
2813   for (unsigned Part = 0; Part < UF; ++Part) {
2814     Value *NewLI;
2815     if (CreateGatherScatter) {
2816       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2817       Value *VectorGep = State.get(Addr, Part);
2818       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2819                                          nullptr, "wide.masked.gather");
2820       addMetadata(NewLI, LI);
2821     } else {
2822       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2823       if (isMaskRequired)
2824         NewLI = Builder.CreateMaskedLoad(
2825             VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
2826             "wide.masked.load");
2827       else
2828         NewLI =
2829             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2830 
      // Add metadata to the load, but record the reverse shuffle as the
      // resulting value.
2832       addMetadata(NewLI, LI);
2833       if (Reverse)
2834         NewLI = reverseVector(NewLI);
2835     }
2836 
2837     State.set(Def, Instr, NewLI, Part);
2838   }
2839 }
2840 
2841 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
2842                                                const VPIteration &Instance,
2843                                                bool IfPredicateInstr,
2844                                                VPTransformState &State) {
2845   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2846 
2847   setDebugLocFromInst(Builder, Instr);
2848 
  // Does this instruction return a value?
2850   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2851 
2852   Instruction *Cloned = Instr->clone();
2853   if (!IsVoidRetTy)
2854     Cloned->setName(Instr->getName() + ".cloned");
2855 
2856   // Replace the operands of the cloned instructions with their scalar
2857   // equivalents in the new loop.
2858   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2859     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2860     auto InputInstance = Instance;
2861     if (!Operand || !OrigLoop->contains(Operand) ||
2862         (Cost->isUniformAfterVectorization(Operand, State.VF)))
2863       InputInstance.Lane = 0;
2864     auto *NewOp = State.get(User.getOperand(op), InputInstance);
2865     Cloned->setOperand(op, NewOp);
2866   }
2867   addNewMetadata(Cloned, Instr);
2868 
2869   // Place the cloned scalar in the new loop.
2870   Builder.Insert(Cloned);
2871 
  // TODO: Set result for VPValue of VPReplicateRecipe. This requires
2873   // representing scalar values in VPTransformState. Add the cloned scalar to
2874   // the scalar map entry.
2875   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2876 
2877   // If we just cloned a new assumption, add it the assumption cache.
2878   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2879     if (II->getIntrinsicID() == Intrinsic::assume)
2880       AC->registerAssumption(II);
2881 
2882   // End if-block.
2883   if (IfPredicateInstr)
2884     PredicatedInstructions.push_back(Cloned);
2885 }
2886 
2887 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2888                                                       Value *End, Value *Step,
2889                                                       Instruction *DL) {
2890   BasicBlock *Header = L->getHeader();
2891   BasicBlock *Latch = L->getLoopLatch();
2892   // As we're just creating this loop, it's possible no latch exists
2893   // yet. If so, use the header as this will be a single block loop.
2894   if (!Latch)
2895     Latch = Header;
2896 
2897   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2898   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2899   setDebugLocFromInst(Builder, OldInst);
2900   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2901 
2902   Builder.SetInsertPoint(Latch->getTerminator());
2903   setDebugLocFromInst(Builder, OldInst);
2904 
2905   // Create i+1 and fill the PHINode.
2906   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2907   Induction->addIncoming(Start, L->getLoopPreheader());
2908   Induction->addIncoming(Next, Latch);
2909   // Create the compare.
2910   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2911   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2912 
2913   // Now we have two terminators. Remove the old one from the block.
2914   Latch->getTerminator()->eraseFromParent();
2915 
2916   return Induction;
2917 }
2918 
2919 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2920   if (TripCount)
2921     return TripCount;
2922 
2923   assert(L && "Create Trip Count for null loop.");
2924   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2925   // Find the loop boundaries.
2926   ScalarEvolution *SE = PSE.getSE();
2927   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2928   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2929          "Invalid loop count");
2930 
2931   Type *IdxTy = Legal->getWidestInductionType();
2932   assert(IdxTy && "No type for induction");
2933 
  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way we could get a backedge-taken count of this form is
  // if the induction variable was signed, in which case it will not overflow,
  // so the truncation is legal.
2939   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2940       IdxTy->getPrimitiveSizeInBits())
2941     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2942   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2943 
2944   // Get the total trip count from the count by adding 1.
2945   const SCEV *ExitCount = SE->getAddExpr(
2946       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2947 
2948   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2949 
2950   // Expand the trip count and place the new instructions in the preheader.
2951   // Notice that the pre-header does not change, only the loop body.
2952   SCEVExpander Exp(*SE, DL, "induction");
2953 
2954   // Count holds the overall loop count (N).
2955   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2956                                 L->getLoopPreheader()->getTerminator());
2957 
2958   if (TripCount->getType()->isPointerTy())
2959     TripCount =
2960         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2961                                     L->getLoopPreheader()->getTerminator());
2962 
2963   return TripCount;
2964 }
2965 
2966 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2967   if (VectorTripCount)
2968     return VectorTripCount;
2969 
2970   Value *TC = getOrCreateTripCount(L);
2971   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2972 
2973   Type *Ty = TC->getType();
2974   // This is where we can make the step a runtime constant.
2975   Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
2976 
2977   // If the tail is to be folded by masking, round the number of iterations N
2978   // up to a multiple of Step instead of rounding down. This is done by first
2979   // adding Step-1 and then rounding down. Note that it's ok if this addition
2980   // overflows: the vector induction variable will eventually wrap to zero given
2981   // that it starts at zero and its Step is a power of two; the loop will then
2982   // exit, with the last early-exit vector comparison also producing all-true.
2983   if (Cost->foldTailByMasking()) {
2984     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2985            "VF*UF must be a power of 2 when folding tail by masking");
2986     assert(!VF.isScalable() &&
2987            "Tail folding not yet supported for scalable vectors");
2988     TC = Builder.CreateAdd(
2989         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
2990   }
2991 
2992   // Now we need to generate the expression for the part of the loop that the
2993   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2994   // iterations are not required for correctness, or N - Step, otherwise. Step
2995   // is equal to the vectorization factor (number of SIMD elements) times the
2996   // unroll factor (number of SIMD instructions).
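  // For example, with VF = 4, UF = 2 and N = 13, Step is 8, the remainder is
  // 5, and the vector trip count is 8. With tail folding, N was first rounded
  // up to 20 above, giving a vector trip count of 16.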
2997   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2998 
2999   // If there is a non-reversed interleaved group that may speculatively access
3000   // memory out-of-bounds, we need to ensure that there will be at least one
3001   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
3002   // the trip count, we set the remainder to be equal to the step. If the step
3003   // does not evenly divide the trip count, no adjustment is necessary since
3004   // there will already be scalar iterations. Note that the minimum iterations
3005   // check ensures that N >= Step.
3006   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
3007     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3008     R = Builder.CreateSelect(IsZero, Step, R);
3009   }
3010 
3011   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3012 
3013   return VectorTripCount;
3014 }
3015 
3016 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3017                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as DstVTy.
  auto *DstFVTy = cast<FixedVectorType>(DstVTy);
  unsigned VF = DstFVTy->getNumElements();
  auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
3023   Type *SrcElemTy = SrcVecTy->getElementType();
3024   Type *DstElemTy = DstFVTy->getElementType();
3025   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3026          "Vector elements must have same size");
3027 
3028   // Do a direct cast if element types are castable.
3029   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3030     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3031   }
  // V cannot be directly cast to the desired vector type. This may happen when
  // V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle it with a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
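  // For example (assuming 64-bit pointers), casting <2 x double> to
  // <2 x i8*> goes through the intermediate type <2 x i64>.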
3036   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3037          "Only one type should be a pointer type");
3038   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3039          "Only one type should be a floating point type");
3040   Type *IntTy =
3041       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3042   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3043   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3044   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3045 }
3046 
3047 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3048                                                          BasicBlock *Bypass) {
3049   Value *Count = getOrCreateTripCount(L);
3050   // Reuse existing vector loop preheader for TC checks.
3051   // Note that new preheader block is generated for vector loop.
3052   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3053   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3054 
3055   // Generate code to check if the loop's trip count is less than VF * UF, or
3056   // equal to it in case a scalar epilogue is required; this implies that the
3057   // vector trip count is zero. This check also covers the case where adding one
3058   // to the backedge-taken count overflowed leading to an incorrect trip count
3059   // of zero. In this case we will also jump to the scalar loop.
3060   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3061                                           : ICmpInst::ICMP_ULT;
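  // For example, with VF = 4 and UF = 2 (and no tail folding), the check below
  // sends trip counts less than 8, or up to and including 8 when a scalar
  // epilogue is required, to the scalar loop.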
3062 
3063   // If tail is to be folded, vector loop takes care of all iterations.
3064   Value *CheckMinIters = Builder.getFalse();
3065   if (!Cost->foldTailByMasking()) {
3066     Value *Step =
3067         createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3068     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3069   }
3070   // Create new preheader for vector loop.
3071   LoopVectorPreHeader =
3072       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3073                  "vector.ph");
3074 
3075   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3076                                DT->getNode(Bypass)->getIDom()) &&
3077          "TC check is expected to dominate Bypass");
3078 
3079   // Update dominator for Bypass & LoopExit.
3080   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3081   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3082 
3083   ReplaceInstWithInst(
3084       TCCheckBlock->getTerminator(),
3085       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3086   LoopBypassBlocks.push_back(TCCheckBlock);
3087 }
3088 
3089 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3090   // Reuse existing vector loop preheader for SCEV checks.
3091   // Note that new preheader block is generated for vector loop.
3092   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
3093 
  // Generate the code to check the SCEV assumptions that we made.
3095   // We want the new basic block to start at the first instruction in a
3096   // sequence of instructions that form a check.
3097   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
3098                    "scev.check");
3099   Value *SCEVCheck = Exp.expandCodeForPredicate(
3100       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
3101 
3102   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
3103     if (C->isZero())
3104       return;
3105 
3106   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3107            (OptForSizeBasedOnProfile &&
3108             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3109          "Cannot SCEV check stride or overflow when optimizing for size");
3110 
3111   SCEVCheckBlock->setName("vector.scevcheck");
3112   // Create new preheader for vector loop.
3113   LoopVectorPreHeader =
3114       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
3115                  nullptr, "vector.ph");
3116 
3117   // Update dominator only if this is first RT check.
3118   if (LoopBypassBlocks.empty()) {
3119     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3120     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3121   }
3122 
3123   ReplaceInstWithInst(
3124       SCEVCheckBlock->getTerminator(),
3125       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
3126   LoopBypassBlocks.push_back(SCEVCheckBlock);
3127   AddedSafetyChecks = true;
3128 }
3129 
3130 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3131   // VPlan-native path does not do any analysis for runtime checks currently.
3132   if (EnableVPlanNativePath)
3133     return;
3134 
3135   // Reuse existing vector loop preheader for runtime memory checks.
3136   // Note that new preheader block is generated for vector loop.
3137   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
3138 
  // Generate the code that checks at runtime whether the arrays overlap. We
  // put the checks into a separate block to make the more common case of few
  // elements faster.
3142   auto *LAI = Legal->getLAI();
3143   const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
3144   if (!RtPtrChecking.Need)
3145     return;
3146 
3147   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3148     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3149            "Cannot emit memory checks when optimizing for size, unless forced "
3150            "to vectorize.");
3151     ORE->emit([&]() {
3152       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3153                                         L->getStartLoc(), L->getHeader())
3154              << "Code-size may be reduced by not forcing "
3155                 "vectorization, or by source-code modifications "
3156                 "eliminating the need for runtime checks "
3157                 "(e.g., adding 'restrict').";
3158     });
3159   }
3160 
3161   MemCheckBlock->setName("vector.memcheck");
3162   // Create new preheader for vector loop.
3163   LoopVectorPreHeader =
3164       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
3165                  "vector.ph");
3166 
3167   auto *CondBranch = cast<BranchInst>(
3168       Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
3169   ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
  // Update dominator only if this is the first RT check. Note that this must
  // be done before MemCheckBlock is added to LoopBypassBlocks below; otherwise
  // the list could never be empty here.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, MemCheckBlock);
    DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
  }

  LoopBypassBlocks.push_back(MemCheckBlock);
  AddedSafetyChecks = true;
3178 
3179   Instruction *FirstCheckInst;
3180   Instruction *MemRuntimeCheck;
3181   std::tie(FirstCheckInst, MemRuntimeCheck) =
3182       addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
3183                        RtPtrChecking.getChecks(), RtPtrChecking.getSE());
3184   assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
3185                             "claimed checks are required");
3186   CondBranch->setCondition(MemRuntimeCheck);
3187 
3188   // We currently don't use LoopVersioning for the actual loop cloning but we
3189   // still use it to add the noalias metadata.
3190   LVer = std::make_unique<LoopVersioning>(
3191       *Legal->getLAI(),
3192       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3193       DT, PSE.getSE());
3194   LVer->prepareNoAliasMetadata();
3195 }
3196 
3197 Value *InnerLoopVectorizer::emitTransformedIndex(
3198     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3199     const InductionDescriptor &ID) const {
3200 
3201   SCEVExpander Exp(*SE, DL, "induction");
3202   auto Step = ID.getStep();
3203   auto StartValue = ID.getStartValue();
3204   assert(Index->getType() == Step->getType() &&
3205          "Index type does not match StepValue type");
3206 
3207   // Note: the IR at this point is broken. We cannot use SE to create any new
3208   // SCEV and then expand it, hoping that SCEV's simplification will give us
3209   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
3210   // lead to various SCEV crashes. So all we can do is to use builder and rely
3211   // on InstCombine for future simplifications. Here we handle some trivial
3212   // cases only.
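  // The transformed index computed below is StartValue + Index * Step for
  // integer inductions, a GEP of StartValue by Index * Step elements for
  // pointer inductions, and StartValue fadd/fsub (Index * Step) for
  // floating-point inductions.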
3213   auto CreateAdd = [&B](Value *X, Value *Y) {
3214     assert(X->getType() == Y->getType() && "Types don't match!");
3215     if (auto *CX = dyn_cast<ConstantInt>(X))
3216       if (CX->isZero())
3217         return Y;
3218     if (auto *CY = dyn_cast<ConstantInt>(Y))
3219       if (CY->isZero())
3220         return X;
3221     return B.CreateAdd(X, Y);
3222   };
3223 
3224   auto CreateMul = [&B](Value *X, Value *Y) {
3225     assert(X->getType() == Y->getType() && "Types don't match!");
3226     if (auto *CX = dyn_cast<ConstantInt>(X))
3227       if (CX->isOne())
3228         return Y;
3229     if (auto *CY = dyn_cast<ConstantInt>(Y))
3230       if (CY->isOne())
3231         return X;
3232     return B.CreateMul(X, Y);
3233   };
3234 
3235   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3236   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3237   // the DomTree is not kept up-to-date for additional blocks generated in the
3238   // vector loop. By using the header as insertion point, we guarantee that the
3239   // expanded instructions dominate all their uses.
3240   auto GetInsertPoint = [this, &B]() {
3241     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3242     if (InsertBB != LoopVectorBody &&
3243         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3244       return LoopVectorBody->getTerminator();
3245     return &*B.GetInsertPoint();
3246   };
3247   switch (ID.getKind()) {
3248   case InductionDescriptor::IK_IntInduction: {
3249     assert(Index->getType() == StartValue->getType() &&
3250            "Index type does not match StartValue type");
3251     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3252       return B.CreateSub(StartValue, Index);
3253     auto *Offset = CreateMul(
3254         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3255     return CreateAdd(StartValue, Offset);
3256   }
3257   case InductionDescriptor::IK_PtrInduction: {
3258     assert(isa<SCEVConstant>(Step) &&
3259            "Expected constant step for pointer induction");
3260     return B.CreateGEP(
3261         StartValue->getType()->getPointerElementType(), StartValue,
3262         CreateMul(Index,
3263                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3264   }
3265   case InductionDescriptor::IK_FpInduction: {
3266     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3267     auto InductionBinOp = ID.getInductionBinOp();
3268     assert(InductionBinOp &&
3269            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3270             InductionBinOp->getOpcode() == Instruction::FSub) &&
3271            "Original bin op should be defined for FP induction");
3272 
3273     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3274 
3275     // Floating point operations had to be 'fast' to enable the induction.
3276     FastMathFlags Flags;
3277     Flags.setFast();
3278 
3279     Value *MulExp = B.CreateFMul(StepValue, Index);
3280     if (isa<Instruction>(MulExp))
3281       // We have to check, the MulExp may be a constant.
3282       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
3283 
3284     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3285                                "induction");
3286     if (isa<Instruction>(BOp))
3287       cast<Instruction>(BOp)->setFastMathFlags(Flags);
3288 
3289     return BOp;
3290   }
3291   case InductionDescriptor::IK_NoInduction:
3292     return nullptr;
3293   }
3294   llvm_unreachable("invalid enum");
3295 }
3296 
3297 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3298   LoopScalarBody = OrigLoop->getHeader();
3299   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3300   LoopExitBlock = OrigLoop->getExitBlock();
3301   assert(LoopExitBlock && "Must have an exit block");
3302   assert(LoopVectorPreHeader && "Invalid loop structure");
3303 
3304   LoopMiddleBlock =
3305       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3306                  LI, nullptr, Twine(Prefix) + "middle.block");
3307   LoopScalarPreHeader =
3308       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3309                  nullptr, Twine(Prefix) + "scalar.ph");
  // We intentionally don't let SplitBlock update LoopInfo since LoopVectorBody
  // should belong to a different loop than LoopVectorPreHeader. LoopVectorBody
  // is explicitly added to the correct place a few lines later.
3313   LoopVectorBody =
3314       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3315                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3316 
3317   // Update dominator for loop exit.
3318   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3319 
3320   // Create and register the new vector loop.
3321   Loop *Lp = LI->AllocateLoop();
3322   Loop *ParentLoop = OrigLoop->getParentLoop();
3323 
3324   // Insert the new loop into the loop nest and register the new basic blocks
3325   // before calling any utilities such as SCEV that require valid LoopInfo.
3326   if (ParentLoop) {
3327     ParentLoop->addChildLoop(Lp);
3328   } else {
3329     LI->addTopLevelLoop(Lp);
3330   }
3331   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3332   return Lp;
3333 }
3334 
3335 void InnerLoopVectorizer::createInductionResumeValues(
3336     Loop *L, Value *VectorTripCount,
3337     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3338   assert(VectorTripCount && L && "Expected valid arguments");
3339   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3340           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3341          "Inconsistent information about additional bypass.");
3342   // We are going to resume the execution of the scalar loop.
3343   // Go over all of the induction variables that we found and fix the
3344   // PHIs that are left in the scalar version of the loop.
3345   // The starting values of PHI nodes depend on the counter of the last
3346   // iteration in the vectorized loop.
3347   // If we come from a bypass edge then we need to start from the original
3348   // start value.
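  // For example, a non-primary induction that starts at %start and steps by 4
  // resumes at %start + 16 * 4 after a vector trip count of 16; the primary
  // induction simply resumes at the vector trip count itself.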
3349   for (auto &InductionEntry : Legal->getInductionVars()) {
3350     PHINode *OrigPhi = InductionEntry.first;
3351     InductionDescriptor II = InductionEntry.second;
3352 
    // Create phi nodes to merge from the backedge-taken check block.
3354     PHINode *BCResumeVal =
3355         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3356                         LoopScalarPreHeader->getTerminator());
3357     // Copy original phi DL over to the new one.
3358     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3359     Value *&EndValue = IVEndValues[OrigPhi];
3360     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3361     if (OrigPhi == OldInduction) {
3362       // We know what the end value is.
3363       EndValue = VectorTripCount;
3364     } else {
3365       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3366       Type *StepType = II.getStep()->getType();
3367       Instruction::CastOps CastOp =
3368           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3369       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3370       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3371       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3372       EndValue->setName("ind.end");
3373 
3374       // Compute the end value for the additional bypass (if applicable).
3375       if (AdditionalBypass.first) {
3376         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3377         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3378                                          StepType, true);
3379         CRD =
3380             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3381         EndValueFromAdditionalBypass =
3382             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3383         EndValueFromAdditionalBypass->setName("ind.end");
3384       }
3385     }
3386     // The new PHI merges the original incoming value, in case of a bypass,
3387     // or the value at the end of the vectorized loop.
3388     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3389 
3390     // Fix the scalar body counter (PHI node).
3391     // The old induction's phi node in the scalar body needs the truncated
3392     // value.
3393     for (BasicBlock *BB : LoopBypassBlocks)
3394       BCResumeVal->addIncoming(II.getStartValue(), BB);
3395 
3396     if (AdditionalBypass.first)
3397       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3398                                             EndValueFromAdditionalBypass);
3399 
3400     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3401   }
3402 }
3403 
3404 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3405                                                       MDNode *OrigLoopID) {
3406   assert(L && "Expected valid loop.");
3407 
3408   // The trip counts should be cached by now.
3409   Value *Count = getOrCreateTripCount(L);
3410   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3411 
3412   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3413 
3414   // Add a check in the middle block to see if we have completed
3415   // all of the iterations in the first vector loop.
3416   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3417   // If tail is to be folded, we know we don't need to run the remainder.
3418   Value *CmpN = Builder.getTrue();
3419   if (!Cost->foldTailByMasking()) {
3420     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3421                            VectorTripCount, "cmp.n",
3422                            LoopMiddleBlock->getTerminator());
3423 
3424     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3425     // of the corresponding compare because they may have ended up with
3426     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3428     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3429   }
3430 
3431   BranchInst *BrInst =
3432       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3433   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3434   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3435 
3436   // Get ready to start creating new instructions into the vectorized body.
3437   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3438          "Inconsistent vector loop preheader");
3439   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3440 
3441   Optional<MDNode *> VectorizedLoopID =
3442       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3443                                       LLVMLoopVectorizeFollowupVectorized});
3444   if (VectorizedLoopID.hasValue()) {
3445     L->setLoopID(VectorizedLoopID.getValue());
3446 
3447     // Do not setAlreadyVectorized if loop attributes have been defined
3448     // explicitly.
3449     return LoopVectorPreHeader;
3450   }
3451 
3452   // Keep all loop hints from the original loop on the vector loop (we'll
3453   // replace the vectorizer-specific hints below).
3454   if (MDNode *LID = OrigLoop->getLoopID())
3455     L->setLoopID(LID);
3456 
3457   LoopVectorizeHints Hints(L, true, *ORE);
3458   Hints.setAlreadyVectorized();
3459 
3460 #ifdef EXPENSIVE_CHECKS
3461   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3462   LI->verify(*DT);
3463 #endif
3464 
3465   return LoopVectorPreHeader;
3466 }
3467 
3468 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3469   /*
3470    In this function we generate a new loop. The new loop will contain
3471    the vectorized instructions while the old loop will continue to run the
3472    scalar remainder.
3473 
3474        [ ] <-- loop iteration number check.
3475     /   |
3476    /    v
3477   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3478   |  /  |
3479   | /   v
3480   ||   [ ]     <-- vector pre header.
3481   |/    |
3482   |     v
3483   |    [  ] \
3484   |    [  ]_|   <-- vector loop.
3485   |     |
3486   |     v
3487   |   -[ ]   <--- middle-block.
3488   |  /  |
3489   | /   v
3490   -|- >[ ]     <--- new preheader.
3491    |    |
3492    |    v
3493    |   [ ] \
3494    |   [ ]_|   <-- old scalar loop to handle remainder.
3495     \   |
3496      \  v
3497       >[ ]     <-- exit block.
3498    ...
3499    */
3500 
3501   // Get the metadata of the original loop before it gets modified.
3502   MDNode *OrigLoopID = OrigLoop->getLoopID();
3503 
3504   // Create an empty vector loop, and prepare basic blocks for the runtime
3505   // checks.
3506   Loop *Lp = createVectorLoopSkeleton("");
3507 
3508   // Now, compare the new count to zero. If it is zero skip the vector loop and
3509   // jump to the scalar loop. This check also covers the case where the
3510   // backedge-taken count is uint##_max: adding one to it will overflow leading
3511   // to an incorrect trip count of zero. In this (rare) case we will also jump
3512   // to the scalar loop.
3513   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3514 
3515   // Generate the code to check any assumptions that we've made for SCEV
3516   // expressions.
3517   emitSCEVChecks(Lp, LoopScalarPreHeader);
3518 
  // Generate the code that checks at runtime whether the arrays overlap. We
  // put the checks into a separate block to make the more common case of few
  // elements faster.
3522   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3523 
3524   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators that often have multiple pointer
3526   // induction variables. In the code below we also support a case where we
3527   // don't have a single induction variable.
3528   //
3529   // We try to obtain an induction variable from the original loop as hard
3530   // as possible. However if we don't find one that:
3531   //   - is an integer
3532   //   - counts from zero, stepping by one
3533   //   - is the size of the widest induction variable type
3534   // then we create a new one.
3535   OldInduction = Legal->getPrimaryInduction();
3536   Type *IdxTy = Legal->getWidestInductionType();
3537   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3538   // The loop step is equal to the vectorization factor (num of SIMD elements)
3539   // times the unroll factor (num of SIMD instructions).
3540   Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3541   Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3542   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3543   Induction =
3544       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3545                               getDebugLocFromInstOrOperands(OldInduction));
3546 
3547   // Emit phis for the new starting index of the scalar loop.
3548   createInductionResumeValues(Lp, CountRoundDown);
3549 
3550   return completeLoopSkeleton(Lp, OrigLoopID);
3551 }
3552 
3553 // Fix up external users of the induction variable. At this point, we are
3554 // in LCSSA form, with all external PHIs that use the IV having one input value,
3555 // coming from the remainder loop. We need those PHIs to also have a correct
3556 // value for the IV when arriving directly from the middle block.
3557 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3558                                        const InductionDescriptor &II,
3559                                        Value *CountRoundDown, Value *EndValue,
3560                                        BasicBlock *MiddleBlock) {
3561   // There are two kinds of external IV usages - those that use the value
3562   // computed in the last iteration (the PHI) and those that use the penultimate
3563   // value (the value that feeds into the phi from the loop latch).
3564   // We allow both, but they, obviously, have different values.
3565 
3566   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3567 
3568   DenseMap<Value *, Value *> MissingVals;
3569 
3570   // An external user of the last iteration's value should see the value that
3571   // the remainder loop uses to initialize its own IV.
3572   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3573   for (User *U : PostInc->users()) {
3574     Instruction *UI = cast<Instruction>(U);
3575     if (!OrigLoop->contains(UI)) {
3576       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3577       MissingVals[UI] = EndValue;
3578     }
3579   }
3580 
  // An external user of the penultimate value needs to see EndValue - Step.
3582   // The simplest way to get this is to recompute it from the constituent SCEVs,
3583   // that is Start + (Step * (CRD - 1)).
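  // For example, with Start = 0, Step = 2 and CRD = 8 the escaping value is
  // 0 + 2 * (8 - 1) = 14, the IV value of the last vectorized iteration
  // before its final increment.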
3584   for (User *U : OrigPhi->users()) {
3585     auto *UI = cast<Instruction>(U);
3586     if (!OrigLoop->contains(UI)) {
3587       const DataLayout &DL =
3588           OrigLoop->getHeader()->getModule()->getDataLayout();
3589       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3590 
3591       IRBuilder<> B(MiddleBlock->getTerminator());
3592       Value *CountMinusOne = B.CreateSub(
3593           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3594       Value *CMO =
3595           !II.getStep()->getType()->isIntegerTy()
3596               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3597                              II.getStep()->getType())
3598               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3599       CMO->setName("cast.cmo");
3600       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3601       Escape->setName("ind.escape");
3602       MissingVals[UI] = Escape;
3603     }
3604   }
3605 
3606   for (auto &I : MissingVals) {
3607     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3609     // that is %IV2 = phi [...], [ %IV1, %latch ]
3610     // In this case, if IV1 has an external use, we need to avoid adding both
3611     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3612     // don't already have an incoming value for the middle block.
3613     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3614       PHI->addIncoming(I.second, MiddleBlock);
3615   }
3616 }
3617 
3618 namespace {
3619 
3620 struct CSEDenseMapInfo {
3621   static bool canHandle(const Instruction *I) {
3622     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3623            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3624   }
3625 
3626   static inline Instruction *getEmptyKey() {
3627     return DenseMapInfo<Instruction *>::getEmptyKey();
3628   }
3629 
3630   static inline Instruction *getTombstoneKey() {
3631     return DenseMapInfo<Instruction *>::getTombstoneKey();
3632   }
3633 
3634   static unsigned getHashValue(const Instruction *I) {
3635     assert(canHandle(I) && "Unknown instruction!");
3636     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3637                                                            I->value_op_end()));
3638   }
3639 
3640   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3641     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3642         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3643       return LHS == RHS;
3644     return LHS->isIdenticalTo(RHS);
3645   }
3646 };
3647 
3648 } // end anonymous namespace
3649 
/// Perform CSE of induction variable instructions.
3651 static void cse(BasicBlock *BB) {
3652   // Perform simple cse.
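  // E.g. identical extractelement or getelementptr instructions produced
  // while scalarizing different uses of the same value are folded into one.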
3653   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3654   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3655     Instruction *In = &*I++;
3656 
3657     if (!CSEDenseMapInfo::canHandle(In))
3658       continue;
3659 
3660     // Check if we can replace this instruction with any of the
3661     // visited instructions.
3662     if (Instruction *V = CSEMap.lookup(In)) {
3663       In->replaceAllUsesWith(V);
3664       In->eraseFromParent();
3665       continue;
3666     }
3667 
3668     CSEMap[In] = In;
3669   }
3670 }
3671 
3672 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3673                                                        ElementCount VF,
3674                                                        bool &NeedToScalarize) {
3675   assert(!VF.isScalable() && "scalable vectors not yet supported.");
3676   Function *F = CI->getCalledFunction();
3677   Type *ScalarRetTy = CI->getType();
3678   SmallVector<Type *, 4> Tys, ScalarTys;
3679   for (auto &ArgOp : CI->arg_operands())
3680     ScalarTys.push_back(ArgOp->getType());
3681 
3682   // Estimate cost of scalarized vector call. The source operands are assumed
3683   // to be vectors, so we need to extract individual elements from there,
3684   // execute VF scalar calls, and then gather the result into the vector return
3685   // value.
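  // I.e. the scalarized cost is roughly
  //   VF * Cost(scalar call) + Cost(arg extracts) + Cost(result inserts).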
3686   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
3687                                                  TTI::TCK_RecipThroughput);
3688   if (VF.isScalar())
3689     return ScalarCallCost;
3690 
3691   // Compute corresponding vector type for return value and arguments.
3692   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3693   for (Type *ScalarTy : ScalarTys)
3694     Tys.push_back(ToVectorTy(ScalarTy, VF));
3695 
3696   // Compute costs of unpacking argument values for the scalar calls and
3697   // packing the return values to a vector.
3698   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3699 
3700   unsigned Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3701 
3702   // If we can't emit a vector call for this function, then the currently found
3703   // cost is the cost we need to return.
3704   NeedToScalarize = true;
3705   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3706   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3707 
3708   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3709     return Cost;
3710 
3711   // If the corresponding vector cost is cheaper, return its cost.
3712   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
3713                                                  TTI::TCK_RecipThroughput);
3714   if (VectorCallCost < Cost) {
3715     NeedToScalarize = false;
3716     return VectorCallCost;
3717   }
3718   return Cost;
3719 }
3720 
3721 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3722                                                             ElementCount VF) {
3723   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3724   assert(ID && "Expected intrinsic call!");
3725 
3726   IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
3727   return TTI.getIntrinsicInstrCost(CostAttrs,
3728                                    TargetTransformInfo::TCK_RecipThroughput);
3729 }
3730 
3731 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3732   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3733   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3734   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3735 }
3736 
3737 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3738   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3739   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3740   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3741 }
3742 
3743 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3744   // For every instruction `I` in MinBWs, truncate the operands, create a
3745   // truncated version of `I` and reextend its result. InstCombine runs
3746   // later and will remove any ext/trunc pairs.
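  // For example, if the cost model determined an i32 add only needs 8 bits,
  //   %a = add <4 x i32> %x, %y
  // becomes
  //   %t = add <4 x i8> %tx, %ty   ; %tx/%ty are the shrunk operands
  // followed by
  //   %a = zext <4 x i8> %t to <4 x i32>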
3747   SmallPtrSet<Value *, 4> Erased;
3748   for (const auto &KV : Cost->getMinimalBitwidths()) {
3749     // If the value wasn't vectorized, we must maintain the original scalar
3750     // type. The absence of the value from VectorLoopValueMap indicates that it
3751     // wasn't vectorized.
3752     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3753       continue;
3754     for (unsigned Part = 0; Part < UF; ++Part) {
3755       Value *I = getOrCreateVectorValue(KV.first, Part);
3756       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3757         continue;
3758       Type *OriginalTy = I->getType();
3759       Type *ScalarTruncatedTy =
3760           IntegerType::get(OriginalTy->getContext(), KV.second);
3761       auto *TruncatedTy = FixedVectorType::get(
3762           ScalarTruncatedTy,
3763           cast<FixedVectorType>(OriginalTy)->getNumElements());
3764       if (TruncatedTy == OriginalTy)
3765         continue;
3766 
3767       IRBuilder<> B(cast<Instruction>(I));
3768       auto ShrinkOperand = [&](Value *V) -> Value * {
3769         if (auto *ZI = dyn_cast<ZExtInst>(V))
3770           if (ZI->getSrcTy() == TruncatedTy)
3771             return ZI->getOperand(0);
3772         return B.CreateZExtOrTrunc(V, TruncatedTy);
3773       };
3774 
3775       // The actual instruction modification depends on the instruction type,
3776       // unfortunately.
3777       Value *NewI = nullptr;
3778       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3779         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3780                              ShrinkOperand(BO->getOperand(1)));
3781 
3782         // Any wrapping introduced by shrinking this operation shouldn't be
3783         // considered undefined behavior. So, we can't unconditionally copy
3784         // arithmetic wrapping flags to NewI.
3785         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3786       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3787         NewI =
3788             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3789                          ShrinkOperand(CI->getOperand(1)));
3790       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3791         NewI = B.CreateSelect(SI->getCondition(),
3792                               ShrinkOperand(SI->getTrueValue()),
3793                               ShrinkOperand(SI->getFalseValue()));
3794       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3795         switch (CI->getOpcode()) {
3796         default:
3797           llvm_unreachable("Unhandled cast!");
3798         case Instruction::Trunc:
3799           NewI = ShrinkOperand(CI->getOperand(0));
3800           break;
3801         case Instruction::SExt:
3802           NewI = B.CreateSExtOrTrunc(
3803               CI->getOperand(0),
3804               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3805           break;
3806         case Instruction::ZExt:
3807           NewI = B.CreateZExtOrTrunc(
3808               CI->getOperand(0),
3809               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3810           break;
3811         }
3812       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3813         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3814                              ->getNumElements();
3815         auto *O0 = B.CreateZExtOrTrunc(
3816             SI->getOperand(0),
3817             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3818         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3819                              ->getNumElements();
3820         auto *O1 = B.CreateZExtOrTrunc(
3821             SI->getOperand(1),
3822             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3823 
3824         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3825       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3826         // Don't do anything with the operands, just extend the result.
3827         continue;
3828       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3829         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3830                             ->getNumElements();
3831         auto *O0 = B.CreateZExtOrTrunc(
3832             IE->getOperand(0),
3833             FixedVectorType::get(ScalarTruncatedTy, Elements));
3834         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3835         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3836       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3837         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3838                             ->getNumElements();
3839         auto *O0 = B.CreateZExtOrTrunc(
3840             EE->getOperand(0),
3841             FixedVectorType::get(ScalarTruncatedTy, Elements));
3842         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3843       } else {
3844         // If we don't know what to do, be conservative and don't do anything.
3845         continue;
3846       }
3847 
3848       // Lastly, extend the result.
3849       NewI->takeName(cast<Instruction>(I));
3850       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3851       I->replaceAllUsesWith(Res);
3852       cast<Instruction>(I)->eraseFromParent();
3853       Erased.insert(I);
3854       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3855     }
3856   }
3857 
  // We'll have created a bunch of ZExts that are now dead. Clean them up.
3859   for (const auto &KV : Cost->getMinimalBitwidths()) {
3860     // If the value wasn't vectorized, we must maintain the original scalar
3861     // type. The absence of the value from VectorLoopValueMap indicates that it
3862     // wasn't vectorized.
3863     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3864       continue;
3865     for (unsigned Part = 0; Part < UF; ++Part) {
3866       Value *I = getOrCreateVectorValue(KV.first, Part);
3867       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3868       if (Inst && Inst->use_empty()) {
3869         Value *NewI = Inst->getOperand(0);
3870         Inst->eraseFromParent();
3871         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3872       }
3873     }
3874   }
3875 }
3876 
3877 void InnerLoopVectorizer::fixVectorizedLoop() {
3878   // Insert truncates and extends for any truncated instructions as hints to
3879   // InstCombine.
3880   if (VF.isVector())
3881     truncateToMinimalBitwidths();
3882 
3883   // Fix widened non-induction PHIs by setting up the PHI operands.
3884   if (OrigPHIsToFix.size()) {
3885     assert(EnableVPlanNativePath &&
3886            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3887     fixNonInductionPHIs();
3888   }
3889 
3890   // At this point every instruction in the original loop is widened to a
3891   // vector form. Now we need to fix the recurrences in the loop. These PHI
3892   // nodes are currently empty because we did not want to introduce cycles.
3893   // This is the second stage of vectorizing recurrences.
3894   fixCrossIterationPHIs();
3895 
3896   // Forget the original basic block.
3897   PSE.getSE()->forgetLoop(OrigLoop);
3898 
3899   // Fix-up external users of the induction variables.
3900   for (auto &Entry : Legal->getInductionVars())
3901     fixupIVUsers(Entry.first, Entry.second,
3902                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3903                  IVEndValues[Entry.first], LoopMiddleBlock);
3904 
3905   fixLCSSAPHIs();
3906   for (Instruction *PI : PredicatedInstructions)
3907     sinkScalarOperands(&*PI);
3908 
3909   // Remove redundant induction instructions.
3910   cse(LoopVectorBody);
3911 
3912   // Set/update profile weights for the vector and remainder loops as original
3913   // loop iterations are now distributed among them. Note that original loop
3914   // represented by LoopScalarBody becomes remainder loop after vectorization.
3915   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that any possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
3921   //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
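  //
  // For example, with VF * UF == 8 an original average trip count of 100 is
  // rescaled to roughly 100 / 8 iterations for the vector loop.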
3925   setProfileInfoAfterUnrolling(
3926       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3927       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3928 }
3929 
3930 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3931   // In order to support recurrences we need to be able to vectorize Phi nodes.
3932   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3933   // stage #2: We now need to fix the recurrences by adding incoming edges to
3934   // the currently empty PHI nodes. At this point every instruction in the
3935   // original loop is widened to a vector form so we can use them to construct
3936   // the incoming edges.
3937   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3938     // Handle first-order recurrences and reductions that need to be fixed.
3939     if (Legal->isFirstOrderRecurrence(&Phi))
3940       fixFirstOrderRecurrence(&Phi);
3941     else if (Legal->isReductionVariable(&Phi))
3942       fixReduction(&Phi);
3943   }
3944 }
3945 
3946 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3947   // This is the second phase of vectorizing first-order recurrences. An
3948   // overview of the transformation is described below. Suppose we have the
3949   // following loop.
3950   //
3951   //   for (int i = 0; i < n; ++i)
3952   //     b[i] = a[i] - a[i - 1];
3953   //
3954   // There is a first-order recurrence on "a". For this loop, the shorthand
3955   // scalar IR looks like:
3956   //
3957   //   scalar.ph:
3958   //     s_init = a[-1]
3959   //     br scalar.body
3960   //
3961   //   scalar.body:
3962   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3963   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3964   //     s2 = a[i]
3965   //     b[i] = s2 - s1
3966   //     br cond, scalar.body, ...
3967   //
  // In this example, s1 is a recurrence because its value depends on the
3969   // previous iteration. In the first phase of vectorization, we created a
3970   // temporary value for s1. We now complete the vectorization and produce the
3971   // shorthand vector IR shown below (for VF = 4, UF = 1).
3972   //
3973   //   vector.ph:
3974   //     v_init = vector(..., ..., ..., a[-1])
3975   //     br vector.body
3976   //
3977   //   vector.body
3978   //     i = phi [0, vector.ph], [i+4, vector.body]
3979   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3980   //     v2 = a[i, i+1, i+2, i+3];
3981   //     v3 = vector(v1(3), v2(0, 1, 2))
3982   //     b[i, i+1, i+2, i+3] = v2 - v3
3983   //     br cond, vector.body, middle.block
3984   //
3985   //   middle.block:
3986   //     x = v2(3)
3987   //     br scalar.ph
3988   //
3989   //   scalar.ph:
3990   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3991   //     br scalar.body
3992   //
  // After the vector loop completes execution, we extract the next value of
3994   // the recurrence (x) to use as the initial value in the scalar loop.
3995 
3996   // Get the original loop preheader and single loop latch.
3997   auto *Preheader = OrigLoop->getLoopPreheader();
3998   auto *Latch = OrigLoop->getLoopLatch();
3999 
4000   // Get the initial and previous values of the scalar recurrence.
4001   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4002   auto *Previous = Phi->getIncomingValueForBlock(Latch);
4003 
4004   // Create a vector from the initial value.
4005   auto *VectorInit = ScalarInit;
4006   if (VF.isVector()) {
4007     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4008     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4009     VectorInit = Builder.CreateInsertElement(
4010         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4011         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
4012   }
4013 
4014   // We constructed a temporary phi node in the first phase of vectorization.
4015   // This phi node will eventually be deleted.
4016   Builder.SetInsertPoint(
4017       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
4018 
4019   // Create a phi node for the new recurrence. The current value will either be
4020   // the initial value inserted into a vector or loop-varying vector value.
4021   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4022   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4023 
4024   // Get the vectorized previous value of the last part UF - 1. It appears last
4025   // among all unrolled iterations, due to the order of their construction.
4026   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
4027 
4028   // Find and set the insertion point after the previous value if it is an
4029   // instruction.
4030   BasicBlock::iterator InsertPt;
4031   // Note that the previous value may have been constant-folded so it is not
4032   // guaranteed to be an instruction in the vector loop.
4033   // FIXME: Loop invariant values do not form recurrences. We should deal with
4034   //        them earlier.
4035   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4036     InsertPt = LoopVectorBody->getFirstInsertionPt();
4037   else {
4038     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4039     if (isa<PHINode>(PreviousLastPart))
4040       // If the previous value is a phi node, we should insert after all the phi
4041       // nodes in the block containing the PHI to avoid breaking basic block
4042       // verification. Note that the basic block may be different to
4043       // LoopVectorBody, in case we predicate the loop.
4044       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4045     else
4046       InsertPt = ++PreviousInst->getIterator();
4047   }
4048   Builder.SetInsertPoint(&*InsertPt);
4049 
4050   // We will construct a vector for the recurrence by combining the values for
4051   // the current and previous iterations. This is the required shuffle mask.
4052   assert(!VF.isScalable());
4053   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
4054   ShuffleMask[0] = VF.getKnownMinValue() - 1;
4055   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
4056     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
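  // E.g. for VF = 4 the mask is <3, 4, 5, 6>: the last lane of the value from
  // the previous vector iteration followed by the first three lanes of the
  // current one.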
4057 
4058   // The vector from which to take the initial value for the current iteration
4059   // (actual or unrolled). Initially, this is the vector phi node.
4060   Value *Incoming = VecPhi;
4061 
4062   // Shuffle the current and previous vector and update the vector parts.
4063   for (unsigned Part = 0; Part < UF; ++Part) {
4064     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
4065     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
4066     auto *Shuffle =
4067         VF.isVector()
4068             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
4069             : Incoming;
4070     PhiPart->replaceAllUsesWith(Shuffle);
4071     cast<Instruction>(PhiPart)->eraseFromParent();
4072     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
4073     Incoming = PreviousPart;
4074   }
4075 
4076   // Fix the latch value of the new recurrence in the vector loop.
4077   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4078 
4079   // Extract the last vector element in the middle block. This will be the
4080   // initial value for the recurrence when jumping to the scalar loop.
4081   auto *ExtractForScalar = Incoming;
4082   if (VF.isVector()) {
4083     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4084     ExtractForScalar = Builder.CreateExtractElement(
4085         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
4086         "vector.recur.extract");
4087   }
4088   // Extract the second last element in the middle block if the
4089   // Phi is used outside the loop. We need to extract the phi itself
4090   // and not the last element (the phi update in the current iteration). This
4091   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4092   // when the scalar loop is not run at all.
4093   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4094   if (VF.isVector())
4095     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4096         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4097         "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
  // `Incoming`, i.e. part UF - 2 of `Previous`. This is analogous to the
  // vectorized case above: extracting the second-to-last element when VF > 1.
4102   else if (UF > 1)
4103     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
4104 
4105   // Fix the initial value of the original recurrence in the scalar loop.
4106   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4107   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4108   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4109     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4110     Start->addIncoming(Incoming, BB);
4111   }
4112 
4113   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4114   Phi->setName("scalar.recur");
4115 
4116   // Finally, fix users of the recurrence outside the loop. The users will need
4117   // either the last value of the scalar recurrence or the last value of the
4118   // vector recurrence we extracted in the middle block. Since the loop is in
4119   // LCSSA form, we just need to find all the phi nodes for the original scalar
4120   // recurrence in the exit block, and then add an edge for the middle block.
4121   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4122     if (LCSSAPhi.getIncomingValue(0) == Phi) {
4123       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4124     }
4125   }
4126 }
4127 
4128 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
4129   Constant *Zero = Builder.getInt32(0);
4130 
  // Get its reduction variable descriptor.
4132   assert(Legal->isReductionVariable(Phi) &&
4133          "Unable to find the reduction variable");
4134   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4135 
4136   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4137   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4138   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4139   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
4140     RdxDesc.getMinMaxRecurrenceKind();
4141   setDebugLocFromInst(Builder, ReductionStartValue);
4142   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4143 
  // We need to generate a reduction vector from the incoming scalar.
  // To do so, we generate the 'identity' vector and override one of its
  // elements with the incoming scalar reduction start value. This must
  // happen in the vector-loop preheader.
4148   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4149 
4150   // This is the vector-clone of the value that leaves the loop.
4151   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
4152 
  // Find the reduction identity value: zero for addition, or and xor;
  // one for multiplication; -1 for and.
4155   Value *Identity;
4156   Value *VectorStart;
4157   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
4158       RK == RecurrenceDescriptor::RK_FloatMinMax) {
    // MinMax reductions have the start value as their identity.
4160     if (VF.isScalar() || IsInLoopReductionPhi) {
4161       VectorStart = Identity = ReductionStartValue;
4162     } else {
4163       VectorStart = Identity =
4164         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
4165     }
4166   } else {
4167     // Handle other reduction kinds:
4168     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
4169         RK, MinMaxKind, VecTy->getScalarType());
4170     if (VF.isScalar() || IsInLoopReductionPhi) {
4171       Identity = Iden;
      // In the scalar and in-loop cases the start value itself seeds the
      // reduction; no identity vector is needed.
4174       VectorStart = ReductionStartValue;
4175     } else {
4176       Identity = ConstantVector::getSplat(VF, Iden);
4177 
4178       // This vector is the Identity vector where the first element is the
4179       // incoming scalar reduction.
4180       VectorStart =
4181         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
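      // E.g. for an i32 add reduction with VF = 4 and start value 7 this
      // yields Identity = <0, 0, 0, 0> and VectorStart = <7, 0, 0, 0>.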
4182     }
4183   }
4184 
4185   // Wrap flags are in general invalid after vectorization, clear them.
4186   clearReductionWrapFlags(RdxDesc);
4187 
4188   // Fix the vector-loop phi.
4189 
  // Reductions do not have to start at zero. They can start with
  // any loop-invariant value.
4192   BasicBlock *Latch = OrigLoop->getLoopLatch();
4193   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4194 
4195   for (unsigned Part = 0; Part < UF; ++Part) {
4196     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
4197     Value *Val = getOrCreateVectorValue(LoopVal, Part);
4198     // Make sure to add the reduction start value only to the
4199     // first unroll part.
4200     Value *StartVal = (Part == 0) ? VectorStart : Identity;
4201     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
4202     cast<PHINode>(VecRdxPhi)
4203       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4204   }
4205 
4206   // Before each round, move the insertion point right between
4207   // the PHIs and the values we are going to write.
4208   // This allows us to write both PHINodes and the extractelement
4209   // instructions.
4210   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4211 
4212   setDebugLocFromInst(Builder, LoopExitInst);
4213 
  // If the tail is folded by masking, the vector value to leave the loop
  // should be a select choosing between the vectorized LoopExitInst and the
  // vectorized Phi, rather than the LoopExitInst alone. For an in-loop
  // reduction the reduction will already be predicated, and does not need to
  // be handled here.
4218   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4219     for (unsigned Part = 0; Part < UF; ++Part) {
4220       Value *VecLoopExitInst =
4221           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4222       Value *Sel = nullptr;
4223       for (User *U : VecLoopExitInst->users()) {
4224         if (isa<SelectInst>(U)) {
4225           assert(!Sel && "Reduction exit feeding two selects");
4226           Sel = U;
4227         } else
4228           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4229       }
4230       assert(Sel && "Reduction exit feeds no select");
4231       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
4232 
4233       // If the target can create a predicated operator for the reduction at no
4234       // extra cost in the loop (for example a predicated vadd), it can be
4235       // cheaper for the select to remain in the loop than be sunk out of it,
4236       // and so use the select value for the phi instead of the old
4237       // LoopExitValue.
4238       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4239       if (PreferPredicatedReductionSelect ||
4240           TTI->preferPredicatedReductionSelect(
4241               RdxDesc.getRecurrenceBinOp(), Phi->getType(),
4242               TargetTransformInfo::ReductionFlags())) {
4243         auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
4244         VecRdxPhi->setIncomingValueForBlock(
4245             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4246       }
4247     }
4248   }
4249 
4250   // If the vector reduction can be performed in a smaller type, we truncate
4251   // then extend the loop exit value to enable InstCombine to evaluate the
4252   // entire expression in the smaller type.
4253   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4254     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4255     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4256     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4257     Builder.SetInsertPoint(
4258         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4259     VectorParts RdxParts(UF);
4260     for (unsigned Part = 0; Part < UF; ++Part) {
4261       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4262       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4263       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4264                                         : Builder.CreateZExt(Trunc, VecTy);
4265       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4266            UI != RdxParts[Part]->user_end();)
4267         if (*UI != Trunc) {
4268           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4269           RdxParts[Part] = Extnd;
4270         } else {
4271           ++UI;
4272         }
4273     }
4274     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4275     for (unsigned Part = 0; Part < UF; ++Part) {
4276       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4277       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
4278     }
4279   }
4280 
4281   // Reduce all of the unrolled parts into a single vector.
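  // E.g. with UF = 2 and an integer add reduction this emits
  //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
  // leaving a single vector value to be reduced to a scalar below.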
4282   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
4283   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
4284 
4285   // The middle block terminator has already been assigned a DebugLoc here (the
4286   // OrigLoop's single latch terminator). We want the whole middle block to
4287   // appear to execute on this line because: (a) it is all compiler generated,
4288   // (b) these instructions are always executed after evaluating the latch
4289   // conditional branch, and (c) other passes may add new predecessors which
4290   // terminate on this line. This is the easiest way to ensure we don't
4291   // accidentally cause an extra step back into the loop while debugging.
4292   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4293   for (unsigned Part = 1; Part < UF; ++Part) {
4294     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
4295     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
4296       // Floating point operations had to be 'fast' to enable the reduction.
4297       ReducedPartRdx = addFastMathFlag(
4298           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
4299                               ReducedPartRdx, "bin.rdx"),
4300           RdxDesc.getFastMathFlags());
4301     else
4302       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
4303                                       RdxPart);
4304   }
4305 
4306   // Create the reduction after the loop. Note that inloop reductions create the
4307   // target reduction in the loop using a Reduction recipe.
4308   if (VF.isVector() && !IsInLoopReductionPhi) {
4309     bool NoNaN = Legal->hasFunNoNaNAttr();
4310     ReducedPartRdx =
4311         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
4312     // If the reduction can be performed in a smaller type, we need to extend
4313     // the reduction to the wider type before we branch to the original loop.
4314     if (Phi->getType() != RdxDesc.getRecurrenceType())
4315       ReducedPartRdx =
4316         RdxDesc.isSigned()
4317         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4318         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4319   }
4320 
4321   // Create a phi node that merges control-flow from the backedge-taken check
4322   // block and the middle block.
4323   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4324                                         LoopScalarPreHeader->getTerminator());
4325   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4326     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4327   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4328 
4329   // Now, we need to fix the users of the reduction variable
4330   // inside and outside of the scalar remainder loop.
4331   // We know that the loop is in LCSSA form. We need to update the
4332   // PHI nodes in the exit blocks.
4333   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4334     // All PHINodes need to have a single entry edge, or two if
4335     // we already fixed them.
4336     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
4337 
4338     // We found a reduction value exit-PHI. Update it with the
4339     // incoming bypass edge.
4340     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
4341       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4342   } // end of the LCSSA phi scan.
4343 
  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
4346   int IncomingEdgeBlockIdx =
4347     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4348   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4349   // Pick the other block.
4350   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4351   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4352   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4353 }
4354 
4355 void InnerLoopVectorizer::clearReductionWrapFlags(
4356     RecurrenceDescriptor &RdxDesc) {
4357   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
4358   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
4359       RK != RecurrenceDescriptor::RK_IntegerMult)
4360     return;
4361 
4362   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4363   assert(LoopExitInstr && "null loop exit instruction");
4364   SmallVector<Instruction *, 8> Worklist;
4365   SmallPtrSet<Instruction *, 8> Visited;
4366   Worklist.push_back(LoopExitInstr);
4367   Visited.insert(LoopExitInstr);
4368 
4369   while (!Worklist.empty()) {
4370     Instruction *Cur = Worklist.pop_back_val();
4371     if (isa<OverflowingBinaryOperator>(Cur))
4372       for (unsigned Part = 0; Part < UF; ++Part) {
4373         Value *V = getOrCreateVectorValue(Cur, Part);
4374         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4375       }
4376 
4377     for (User *U : Cur->users()) {
4378       Instruction *UI = cast<Instruction>(U);
4379       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4380           Visited.insert(UI).second)
4381         Worklist.push_back(UI);
4382     }
4383   }
4384 }
4385 
4386 void InnerLoopVectorizer::fixLCSSAPHIs() {
4387   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4388     if (LCSSAPhi.getNumIncomingValues() == 1) {
4389       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
      // Non-instruction incoming values are uniform, so lane zero suffices.
4391       unsigned LastLane = 0;
4392       if (isa<Instruction>(IncomingValue))
4393         LastLane = Cost->isUniformAfterVectorization(
4394                        cast<Instruction>(IncomingValue), VF)
4395                        ? 0
4396                        : VF.getKnownMinValue() - 1;
4397       assert((!VF.isScalable() || LastLane == 0) &&
4398              "scalable vectors dont support non-uniform scalars yet");
4399       // Can be a loop invariant incoming value or the last scalar value to be
4400       // extracted from the vectorized loop.
4401       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4402       Value *lastIncomingValue =
4403           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
4404       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4405     }
4406   }
4407 }
4408 
4409 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4410   // The basic block and loop containing the predicated instruction.
4411   auto *PredBB = PredInst->getParent();
4412   auto *VectorLoop = LI->getLoopFor(PredBB);
4413 
4414   // Initialize a worklist with the operands of the predicated instruction.
4415   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4416 
4417   // Holds instructions that we need to analyze again. An instruction may be
4418   // reanalyzed if we don't yet know if we can sink it or not.
4419   SmallVector<Instruction *, 8> InstsToReanalyze;
4420 
4421   // Returns true if a given use occurs in the predicated block. Phi nodes use
4422   // their operands in their corresponding predecessor blocks.
4423   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4424     auto *I = cast<Instruction>(U.getUser());
4425     BasicBlock *BB = I->getParent();
4426     if (auto *Phi = dyn_cast<PHINode>(I))
4427       BB = Phi->getIncomingBlock(
4428           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4429     return BB == PredBB;
4430   };
4431 
4432   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
4435   // through the worklist doesn't sink a single instruction.
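  // E.g. an address computation that only feeds a predicated store can be
  // moved into the predicated block; its own operands may then become
  // sinkable on the next pass.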
4436   bool Changed;
4437   do {
4438     // Add the instructions that need to be reanalyzed to the worklist, and
4439     // reset the changed indicator.
4440     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4441     InstsToReanalyze.clear();
4442     Changed = false;
4443 
4444     while (!Worklist.empty()) {
4445       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4446 
4447       // We can't sink an instruction if it is a phi node, is already in the
4448       // predicated block, is not in the loop, or may have side effects.
4449       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4450           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4451         continue;
4452 
4453       // It's legal to sink the instruction if all its uses occur in the
4454       // predicated block. Otherwise, there's nothing to do yet, and we may
4455       // need to reanalyze the instruction.
4456       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4457         InstsToReanalyze.push_back(I);
4458         continue;
4459       }
4460 
4461       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4463       I->moveBefore(&*PredBB->getFirstInsertionPt());
4464       Worklist.insert(I->op_begin(), I->op_end());
4465 
4466       // The sinking may have enabled other instructions to be sunk, so we will
4467       // need to iterate.
4468       Changed = true;
4469     }
4470   } while (Changed);
4471 }
4472 
4473 void InnerLoopVectorizer::fixNonInductionPHIs() {
4474   for (PHINode *OrigPhi : OrigPHIsToFix) {
4475     PHINode *NewPhi =
4476         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4477     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4478 
4479     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4480         predecessors(OrigPhi->getParent()));
4481     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4482         predecessors(NewPhi->getParent()));
4483     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4484            "Scalar and Vector BB should have the same number of predecessors");
4485 
4486     // The insertion point in Builder may be invalidated by the time we get
4487     // here. Force the Builder insertion point to something valid so that we do
4488     // not run into issues during insertion point restore in
4489     // getOrCreateVectorValue calls below.
4490     Builder.SetInsertPoint(NewPhi);
4491 
4492     // The predecessor order is preserved and we can rely on mapping between
4493     // scalar and vector block predecessors.
4494     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4495       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4496 
4497       // When looking up the new scalar/vector values to fix up, use incoming
4498       // values from original phi.
4499       Value *ScIncV =
4500           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4501 
4502       // Scalar incoming value may need a broadcast
4503       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4504       NewPhi->addIncoming(NewIncV, NewPredBB);
4505     }
4506   }
4507 }
4508 
4509 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4510                                    VPUser &Operands, unsigned UF,
4511                                    ElementCount VF, bool IsPtrLoopInvariant,
4512                                    SmallBitVector &IsIndexLoopInvariant,
4513                                    VPTransformState &State) {
4514   // Construct a vector GEP by widening the operands of the scalar GEP as
4515   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4516   // results in a vector of pointers when at least one operand of the GEP
4517   // is vector-typed. Thus, to keep the representation compact, we only use
4518   // vector-typed operands for loop-varying values.
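  // E.g. widening `getelementptr i32, i32* %base, i64 %i` with a loop-varying
  // index yields `getelementptr i32, i32* %base, <4 x i64> %vec.i`, which
  // produces a <4 x i32*> vector of pointers.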
4519 
4520   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4521     // If we are vectorizing, but the GEP has only loop-invariant operands,
4522     // the GEP we build (by only using vector-typed operands for
4523     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4524     // produce a vector of pointers, we need to either arbitrarily pick an
4525     // operand to broadcast, or broadcast a clone of the original GEP.
4526     // Here, we broadcast a clone of the original.
4527     //
4528     // TODO: If at some point we decide to scalarize instructions having
4529     //       loop-invariant operands, this special case will no longer be
4530     //       required. We would add the scalarization decision to
4531     //       collectLoopScalars() and teach getVectorValue() to broadcast
4532     //       the lane-zero scalar value.
4533     auto *Clone = Builder.Insert(GEP->clone());
4534     for (unsigned Part = 0; Part < UF; ++Part) {
4535       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4536       State.set(VPDef, GEP, EntryPart, Part);
4537       addMetadata(EntryPart, GEP);
4538     }
4539   } else {
4540     // If the GEP has at least one loop-varying operand, we are sure to
4541     // produce a vector of pointers. But if we are only unrolling, we want
4542     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4543     // produce with the code below will be scalar (if VF == 1) or vector
4544     // (otherwise). Note that for the unroll-only case, we still maintain
4545     // values in the vector mapping with initVector, as we do for other
4546     // instructions.
4547     for (unsigned Part = 0; Part < UF; ++Part) {
4548       // The pointer operand of the new GEP. If it's loop-invariant, we
4549       // won't broadcast it.
4550       auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
4551                                      : State.get(Operands.getOperand(0), Part);
4552 
4553       // Collect all the indices for the new GEP. If any index is
4554       // loop-invariant, we won't broadcast it.
4555       SmallVector<Value *, 4> Indices;
4556       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4557         VPValue *Operand = Operands.getOperand(I);
4558         if (IsIndexLoopInvariant[I - 1])
4559           Indices.push_back(State.get(Operand, {0, 0}));
4560         else
4561           Indices.push_back(State.get(Operand, Part));
4562       }
4563 
4564       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4565       // but it should be a vector, otherwise.
4566       auto *NewGEP =
4567           GEP->isInBounds()
4568               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4569                                           Indices)
4570               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4571       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4572              "NewGEP is not a pointer vector");
4573       State.set(VPDef, GEP, NewGEP, Part);
4574       addMetadata(NewGEP, GEP);
4575     }
4576   }
4577 }
4578 
4579 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4580                                               ElementCount VF) {
4581   assert(!VF.isScalable() && "scalable vectors not yet supported.");
4582   PHINode *P = cast<PHINode>(PN);
4583   if (EnableVPlanNativePath) {
4584     // Currently we enter here in the VPlan-native path for non-induction
4585     // PHIs where all control flow is uniform. We simply widen these PHIs.
4586     // Create a vector phi with no operands - the vector phi operands will be
4587     // set at the end of vector code generation.
4588     Type *VecTy =
4589         (VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
4590     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4591     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4592     OrigPHIsToFix.push_back(P);
4593 
4594     return;
4595   }
4596 
4597   assert(PN->getParent() == OrigLoop->getHeader() &&
4598          "Non-header phis should have been handled elsewhere");
4599 
4600   // In order to support recurrences we need to be able to vectorize Phi nodes.
4601   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4602   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4603   // this value when we vectorize all of the instructions that use the PHI.
4604   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4605     for (unsigned Part = 0; Part < UF; ++Part) {
4606       // This is phase one of vectorizing PHIs.
4607       bool ScalarPHI =
4608           (VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4609       Type *VecTy =
4610           ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), VF);
4611       Value *EntryPart = PHINode::Create(
4612           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4613       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4614     }
4615     return;
4616   }
4617 
4618   setDebugLocFromInst(Builder, P);
4619 
4620   // This PHINode must be an induction variable.
4621   // Make sure that we know about it.
4622   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4623 
4624   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4625   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4626 
4627   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4628   // which can be found from the original scalar operations.
4629   switch (II.getKind()) {
4630   case InductionDescriptor::IK_NoInduction:
4631     llvm_unreachable("Unknown induction");
4632   case InductionDescriptor::IK_IntInduction:
4633   case InductionDescriptor::IK_FpInduction:
4634     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4635   case InductionDescriptor::IK_PtrInduction: {
4636     // Handle the pointer induction variable case.
4637     assert(P->getType()->isPointerTy() && "Unexpected type.");
4638 
4639     if (Cost->isScalarAfterVectorization(P, VF)) {
4640       // This is the normalized GEP that starts counting at zero.
4641       Value *PtrInd =
4642           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4643       // Determine the number of scalars we need to generate for each unroll
4644       // iteration. If the instruction is uniform, we only need to generate the
4645       // first lane. Otherwise, we generate all VF values.
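      // E.g. with VF = 4 and UF = 2 a uniform pointer IV needs only 2 scalar
      // GEPs (one per part), while a non-uniform one needs all 8.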
4646       unsigned Lanes =
4647           Cost->isUniformAfterVectorization(P, VF) ? 1 : VF.getKnownMinValue();
4648       for (unsigned Part = 0; Part < UF; ++Part) {
4649         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4650           Constant *Idx = ConstantInt::get(PtrInd->getType(),
4651                                            Lane + Part * VF.getKnownMinValue());
4652           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4653           Value *SclrGep =
4654               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4655           SclrGep->setName("next.gep");
4656           VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4657         }
4658       }
4659       return;
4660     }
4661     assert(isa<SCEVConstant>(II.getStep()) &&
4662            "Induction step not a SCEV constant!");
4663     Type *PhiType = II.getStep()->getType();
4664 
4665     // Build a pointer phi
4666     Value *ScalarStartValue = II.getStartValue();
4667     Type *ScStValueType = ScalarStartValue->getType();
4668     PHINode *NewPointerPhi =
4669         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4670     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4671 
    // A pointer induction, advanced by VF * UF * Step per vector iteration
    // using a GEP in the loop latch.
4673     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4674     Instruction *InductionLoc = LoopLatch->getTerminator();
4675     const SCEV *ScalarStep = II.getStep();
4676     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4677     Value *ScalarStepValue =
4678         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4679     Value *InductionGEP = GetElementPtrInst::Create(
4680         ScStValueType->getPointerElementType(), NewPointerPhi,
4681         Builder.CreateMul(
4682             ScalarStepValue,
4683             ConstantInt::get(PhiType, VF.getKnownMinValue() * UF)),
4684         "ptr.ind", InductionLoc);
4685     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4686 
4687     // Create UF many actual address geps that use the pointer
4688     // phi as base and a vectorized version of the step value
4689     // (<step*0, ..., step*N>) as offset.
4690     for (unsigned Part = 0; Part < UF; ++Part) {
4691       SmallVector<Constant *, 8> Indices;
4692       // Create a vector of consecutive numbers from zero to VF.
4693       for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
4694         Indices.push_back(
4695             ConstantInt::get(PhiType, i + Part * VF.getKnownMinValue()));
4696       Constant *StartOffset = ConstantVector::get(Indices);
4697 
4698       Value *GEP = Builder.CreateGEP(
4699           ScStValueType->getPointerElementType(), NewPointerPhi,
4700           Builder.CreateMul(
4701               StartOffset,
4702               Builder.CreateVectorSplat(VF.getKnownMinValue(), ScalarStepValue),
4703               "vector.gep"));
4704       VectorLoopValueMap.setVectorValue(P, Part, GEP);
4705     }
4706   }
4707   }
4708 }
4709 
4710 /// A helper function for checking whether an integer division-related
4711 /// instruction may divide by zero (in which case it must be predicated if
4712 /// executed conditionally in the scalar code).
4713 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Known non-zero divisors that are not compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so without predication.
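/// E.g. `udiv i32 %x, 7` never needs predication here, while
/// `udiv i32 %x, %y` (or a divisor that is the constant zero) does.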
4717 static bool mayDivideByZero(Instruction &I) {
4718   assert((I.getOpcode() == Instruction::UDiv ||
4719           I.getOpcode() == Instruction::SDiv ||
4720           I.getOpcode() == Instruction::URem ||
4721           I.getOpcode() == Instruction::SRem) &&
4722          "Unexpected instruction");
4723   Value *Divisor = I.getOperand(1);
4724   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4725   return !CInt || CInt->isZero();
4726 }
4727 
4728 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4729                                            VPUser &User,
4730                                            VPTransformState &State) {
4731   switch (I.getOpcode()) {
4732   case Instruction::Call:
4733   case Instruction::Br:
4734   case Instruction::PHI:
4735   case Instruction::GetElementPtr:
4736   case Instruction::Select:
4737     llvm_unreachable("This instruction is handled by a different recipe.");
4738   case Instruction::UDiv:
4739   case Instruction::SDiv:
4740   case Instruction::SRem:
4741   case Instruction::URem:
4742   case Instruction::Add:
4743   case Instruction::FAdd:
4744   case Instruction::Sub:
4745   case Instruction::FSub:
4746   case Instruction::FNeg:
4747   case Instruction::Mul:
4748   case Instruction::FMul:
4749   case Instruction::FDiv:
4750   case Instruction::FRem:
4751   case Instruction::Shl:
4752   case Instruction::LShr:
4753   case Instruction::AShr:
4754   case Instruction::And:
4755   case Instruction::Or:
4756   case Instruction::Xor: {
4757     // Just widen unops and binops.
4758     setDebugLocFromInst(Builder, &I);
4759 
4760     for (unsigned Part = 0; Part < UF; ++Part) {
4761       SmallVector<Value *, 2> Ops;
4762       for (VPValue *VPOp : User.operands())
4763         Ops.push_back(State.get(VPOp, Part));
4764 
4765       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4766 
4767       if (auto *VecOp = dyn_cast<Instruction>(V))
4768         VecOp->copyIRFlags(&I);
4769 
4770       // Use this vector value for all users of the original instruction.
4771       State.set(Def, &I, V, Part);
4772       addMetadata(V, &I);
4773     }
4774 
4775     break;
4776   }
4777   case Instruction::ICmp:
4778   case Instruction::FCmp: {
4779     // Widen compares. Generate vector compares.
4780     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4781     auto *Cmp = cast<CmpInst>(&I);
4782     setDebugLocFromInst(Builder, Cmp);
4783     for (unsigned Part = 0; Part < UF; ++Part) {
4784       Value *A = State.get(User.getOperand(0), Part);
4785       Value *B = State.get(User.getOperand(1), Part);
4786       Value *C = nullptr;
4787       if (FCmp) {
4788         // Propagate fast math flags.
4789         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4790         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4791         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4792       } else {
4793         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4794       }
4795       State.set(Def, &I, C, Part);
4796       addMetadata(C, &I);
4797     }
4798 
4799     break;
4800   }
4801 
4802   case Instruction::ZExt:
4803   case Instruction::SExt:
4804   case Instruction::FPToUI:
4805   case Instruction::FPToSI:
4806   case Instruction::FPExt:
4807   case Instruction::PtrToInt:
4808   case Instruction::IntToPtr:
4809   case Instruction::SIToFP:
4810   case Instruction::UIToFP:
4811   case Instruction::Trunc:
4812   case Instruction::FPTrunc:
4813   case Instruction::BitCast: {
4814     auto *CI = cast<CastInst>(&I);
4815     setDebugLocFromInst(Builder, CI);
4816 
    // Vectorize casts.
4818     Type *DestTy =
4819         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4820 
4821     for (unsigned Part = 0; Part < UF; ++Part) {
4822       Value *A = State.get(User.getOperand(0), Part);
4823       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4824       State.set(Def, &I, Cast, Part);
4825       addMetadata(Cast, &I);
4826     }
4827     break;
4828   }
4829   default:
4830     // This instruction is not vectorized by simple widening.
4831     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4832     llvm_unreachable("Unhandled instruction!");
4833   } // end of switch.
4834 }
4835 
4836 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4837                                                VPUser &ArgOperands,
4838                                                VPTransformState &State) {
4839   assert(!isa<DbgInfoIntrinsic>(I) &&
4840          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4841   setDebugLocFromInst(Builder, &I);
4842 
4843   Module *M = I.getParent()->getParent()->getParent();
4844   auto *CI = cast<CallInst>(&I);
4845 
4846   SmallVector<Type *, 4> Tys;
4847   for (Value *ArgOperand : CI->arg_operands())
4848     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4849 
4850   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4851 
  // The flag indicates whether we use an intrinsic or a plain call for the
  // vectorized version of the instruction, i.e. whether the intrinsic call is
  // cheaper than the vectorized library call.
4855   bool NeedToScalarize = false;
4856   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4857   bool UseVectorIntrinsic =
4858       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4859   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4860          "Instruction should be scalarized elsewhere.");
4861 
4862   for (unsigned Part = 0; Part < UF; ++Part) {
4863     SmallVector<Value *, 4> Args;
4864     for (auto &I : enumerate(ArgOperands.operands())) {
4865       // Some intrinsics have a scalar argument - don't replace it with a
4866       // vector.
4867       Value *Arg;
4868       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4869         Arg = State.get(I.value(), Part);
4870       else
4871         Arg = State.get(I.value(), {0, 0});
4872       Args.push_back(Arg);
4873     }
4874 
4875     Function *VectorF;
4876     if (UseVectorIntrinsic) {
4877       // Use vector version of the intrinsic.
4878       Type *TysForDecl[] = {CI->getType()};
4879       if (VF.isVector()) {
4880         assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4881         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4882       }
4883       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4884       assert(VectorF && "Can't retrieve vector intrinsic.");
4885     } else {
4886       // Use vector version of the function call.
4887       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4888 #ifndef NDEBUG
4889       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4890              "Can't create vector function.");
4891 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, &I, V, Part);
    addMetadata(V, &I);
4903   }
4904 }
4905 
4906 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
4907                                                  VPUser &Operands,
4908                                                  bool InvariantCond,
4909                                                  VPTransformState &State) {
4910   setDebugLocFromInst(Builder, &I);
4911 
  // The condition can be loop invariant but still defined inside the
4913   // loop. This means that we can't just use the original 'cond' value.
4914   // We have to take the 'vectorized' value and pick the first lane.
4915   // Instcombine will make this a no-op.
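  // For example (illustrative): for 'select i1 %c, %a, %b' with an invariant
  // condition, lane 0 of the vectorized '%c' is used as a scalar condition
  // for the select of every unrolled part.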
4916   auto *InvarCond =
4917       InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
4918 
4919   for (unsigned Part = 0; Part < UF; ++Part) {
4920     Value *Cond =
4921         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
4922     Value *Op0 = State.get(Operands.getOperand(1), Part);
4923     Value *Op1 = State.get(Operands.getOperand(2), Part);
4924     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
4925     State.set(VPDef, &I, Sel, Part);
4926     addMetadata(Sel, &I);
4927   }
4928 }
4929 
4930 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4931   // We should not collect Scalars more than once per VF. Right now, this
4932   // function is called from collectUniformsAndScalars(), which already does
4933   // this check. Collecting Scalars for VF=1 does not make any sense.
4934   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4935          "This function should not be visited twice for the same VF");
4936 
4937   SmallSetVector<Instruction *, 8> Worklist;
4938 
4939   // These sets are used to seed the analysis with pointers used by memory
4940   // accesses that will remain scalar.
4941   SmallSetVector<Instruction *, 8> ScalarPtrs;
4942   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4943   auto *Latch = TheLoop->getLoopLatch();
4944 
4945   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4946   // The pointer operands of loads and stores will be scalar as long as the
4947   // memory access is not a gather or scatter operation. The value operand of a
4948   // store will remain scalar if the store is scalarized.
4949   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4950     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4951     assert(WideningDecision != CM_Unknown &&
4952            "Widening decision should be ready at this moment");
4953     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4954       if (Ptr == Store->getValueOperand())
4955         return WideningDecision == CM_Scalarize;
4956     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4957            "Ptr is neither a value or pointer operand");
4958     return WideningDecision != CM_GatherScatter;
4959   };
4960 
4961   // A helper that returns true if the given value is a bitcast or
4962   // getelementptr instruction contained in the loop.
4963   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4964     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4965             isa<GetElementPtrInst>(V)) &&
4966            !TheLoop->isLoopInvariant(V);
4967   };
4968 
4969   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
4970     if (!isa<PHINode>(Ptr) ||
4971         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
4972       return false;
4973     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
4974     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
4975       return false;
4976     return isScalarUse(MemAccess, Ptr);
4977   };
4978 
4979   // A helper that evaluates a memory access's use of a pointer. If the
4980   // pointer is actually the pointer induction of a loop, it is being
4981   // inserted into Worklist. If the use will be a scalar use, and the
4982   // pointer is only used by memory accesses, we place the pointer in
4983   // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
4984   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4985     if (isScalarPtrInduction(MemAccess, Ptr)) {
4986       Worklist.insert(cast<Instruction>(Ptr));
4987       Instruction *Update = cast<Instruction>(
4988           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
4989       Worklist.insert(Update);
4990       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
4991                         << "\n");
4992       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
4993                         << "\n");
4994       return;
4995     }
4996     // We only care about bitcast and getelementptr instructions contained in
4997     // the loop.
4998     if (!isLoopVaryingBitCastOrGEP(Ptr))
4999       return;
5000 
5001     // If the pointer has already been identified as scalar (e.g., if it was
5002     // also identified as uniform), there's nothing to do.
5003     auto *I = cast<Instruction>(Ptr);
5004     if (Worklist.count(I))
5005       return;
5006 
5007     // If the use of the pointer will be a scalar use, and all users of the
5008     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5009     // place the pointer in PossibleNonScalarPtrs.
5010     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5011           return isa<LoadInst>(U) || isa<StoreInst>(U);
5012         }))
5013       ScalarPtrs.insert(I);
5014     else
5015       PossibleNonScalarPtrs.insert(I);
5016   };
5017 
  // We seed the scalars analysis with two classes of instructions: (1)
5019   // instructions marked uniform-after-vectorization and (2) bitcast,
5020   // getelementptr and (pointer) phi instructions used by memory accesses
5021   // requiring a scalar use.
5022   //
5023   // (1) Add to the worklist all instructions that have been identified as
5024   // uniform-after-vectorization.
5025   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5026 
5027   // (2) Add to the worklist all bitcast and getelementptr instructions used by
5028   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
5030   // scatter operation. The value operand of a store will remain scalar if the
5031   // store is scalarized.
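  // For example (illustrative): in a loop computing 'A[i] = B[i] + 1' with
  // consecutive accesses, the GEPs computing &B[i] and &A[i] are seeded here,
  // because a widened load/store still consumes a single scalar address per
  // part.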
5032   for (auto *BB : TheLoop->blocks())
5033     for (auto &I : *BB) {
5034       if (auto *Load = dyn_cast<LoadInst>(&I)) {
5035         evaluatePtrUse(Load, Load->getPointerOperand());
5036       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5037         evaluatePtrUse(Store, Store->getPointerOperand());
5038         evaluatePtrUse(Store, Store->getValueOperand());
5039       }
5040     }
5041   for (auto *I : ScalarPtrs)
5042     if (!PossibleNonScalarPtrs.count(I)) {
5043       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5044       Worklist.insert(I);
5045     }
5046 
5047   // Insert the forced scalars.
5048   // FIXME: Currently widenPHIInstruction() often creates a dead vector
5049   // induction variable when the PHI user is scalarized.
5050   auto ForcedScalar = ForcedScalars.find(VF);
5051   if (ForcedScalar != ForcedScalars.end())
5052     for (auto *I : ForcedScalar->second)
5053       Worklist.insert(I);
5054 
5055   // Expand the worklist by looking through any bitcasts and getelementptr
5056   // instructions we've already identified as scalar. This is similar to the
5057   // expansion step in collectLoopUniforms(); however, here we're only
5058   // expanding to include additional bitcasts and getelementptr instructions.
5059   unsigned Idx = 0;
5060   while (Idx != Worklist.size()) {
5061     Instruction *Dst = Worklist[Idx++];
5062     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5063       continue;
5064     auto *Src = cast<Instruction>(Dst->getOperand(0));
5065     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5066           auto *J = cast<Instruction>(U);
5067           return !TheLoop->contains(J) || Worklist.count(J) ||
5068                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5069                   isScalarUse(J, Src));
5070         })) {
5071       Worklist.insert(Src);
5072       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5073     }
5074   }
5075 
5076   // An induction variable will remain scalar if all users of the induction
5077   // variable and induction variable update remain scalar.
5078   for (auto &Induction : Legal->getInductionVars()) {
5079     auto *Ind = Induction.first;
5080     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5081 
5082     // If tail-folding is applied, the primary induction variable will be used
5083     // to feed a vector compare.
5084     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5085       continue;
5086 
5087     // Determine if all users of the induction variable are scalar after
5088     // vectorization.
5089     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5090       auto *I = cast<Instruction>(U);
5091       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5092     });
5093     if (!ScalarInd)
5094       continue;
5095 
5096     // Determine if all users of the induction variable update instruction are
5097     // scalar after vectorization.
5098     auto ScalarIndUpdate =
5099         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5100           auto *I = cast<Instruction>(U);
5101           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5102         });
5103     if (!ScalarIndUpdate)
5104       continue;
5105 
5106     // The induction variable and its update instruction will remain scalar.
5107     Worklist.insert(Ind);
5108     Worklist.insert(IndUpdate);
5109     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5110     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5111                       << "\n");
5112   }
5113 
5114   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5115 }
5116 
5117 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
5118                                                          ElementCount VF) {
5119   if (!blockNeedsPredication(I->getParent()))
5120     return false;
5121   switch(I->getOpcode()) {
5122   default:
5123     break;
5124   case Instruction::Load:
5125   case Instruction::Store: {
5126     if (!Legal->isMaskRequired(I))
5127       return false;
5128     auto *Ptr = getLoadStorePointerOperand(I);
5129     auto *Ty = getMemInstValueType(I);
    // We have already decided how to vectorize this instruction; get that
    // result.
5132     if (VF.isVector()) {
5133       InstWidening WideningDecision = getWideningDecision(I, VF);
5134       assert(WideningDecision != CM_Unknown &&
5135              "Widening decision should be ready at this moment");
5136       return WideningDecision == CM_Scalarize;
5137     }
5138     const Align Alignment = getLoadStoreAlignment(I);
5139     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5140                                 isLegalMaskedGather(Ty, Alignment))
5141                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5142                                 isLegalMaskedScatter(Ty, Alignment));
5143   }
5144   case Instruction::UDiv:
5145   case Instruction::SDiv:
5146   case Instruction::SRem:
5147   case Instruction::URem:
5148     return mayDivideByZero(*I);
5149   }
5150   return false;
5151 }
5152 
5153 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5154     Instruction *I, ElementCount VF) {
5155   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5156   assert(getWideningDecision(I, VF) == CM_Unknown &&
5157          "Decision should not be set yet.");
5158   auto *Group = getInterleavedAccessGroup(I);
5159   assert(Group && "Must have a group.");
5160 
  // If the instruction's allocated size doesn't equal its type size, it
5162   // requires padding and will be scalarized.
5163   auto &DL = I->getModule()->getDataLayout();
5164   auto *ScalarTy = getMemInstValueType(I);
5165   if (hasIrregularType(ScalarTy, DL, VF))
5166     return false;
5167 
5168   // Check if masking is required.
5169   // A Group may need masking for one of two reasons: it resides in a block that
5170   // needs predication, or it was decided to use masking to deal with gaps.
5171   bool PredicatedAccessRequiresMasking =
5172       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5173   bool AccessWithGapsRequiresMasking =
5174       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5175   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5176     return true;
5177 
5178   // If masked interleaving is required, we expect that the user/target had
5179   // enabled it, because otherwise it either wouldn't have been created or
5180   // it should have been invalidated by the CostModel.
5181   assert(useMaskedInterleavedAccesses(TTI) &&
5182          "Masked interleave-groups for predicated accesses are not enabled.");
5183 
5184   auto *Ty = getMemInstValueType(I);
5185   const Align Alignment = getLoadStoreAlignment(I);
5186   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5187                           : TTI.isLegalMaskedStore(Ty, Alignment);
5188 }
5189 
5190 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5191     Instruction *I, ElementCount VF) {
5192   // Get and ensure we have a valid memory instruction.
5193   LoadInst *LI = dyn_cast<LoadInst>(I);
5194   StoreInst *SI = dyn_cast<StoreInst>(I);
5195   assert((LI || SI) && "Invalid memory instruction");
5196 
5197   auto *Ptr = getLoadStorePointerOperand(I);
5198 
  // In order to be widened, the pointer must, first of all, be consecutive.
5200   if (!Legal->isConsecutivePtr(Ptr))
5201     return false;
5202 
5203   // If the instruction is a store located in a predicated block, it will be
5204   // scalarized.
5205   if (isScalarWithPredication(I))
5206     return false;
5207 
  // If the instruction's allocated size doesn't equal its type size, it
5209   // requires padding and will be scalarized.
5210   auto &DL = I->getModule()->getDataLayout();
5211   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5212   if (hasIrregularType(ScalarTy, DL, VF))
5213     return false;
5214 
5215   return true;
5216 }
5217 
5218 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5219   // We should not collect Uniforms more than once per VF. Right now,
5220   // this function is called from collectUniformsAndScalars(), which
5221   // already does this check. Collecting Uniforms for VF=1 does not make any
5222   // sense.
5223 
5224   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5225          "This function should not be visited twice for the same VF");
5226 
  // Visit the list of Uniforms. Even if we do not find any uniform value, we
  // won't analyze this VF again: Uniforms.count(VF) will still return 1.
5229   Uniforms[VF].clear();
5230 
5231   // We now know that the loop is vectorizable!
5232   // Collect instructions inside the loop that will remain uniform after
5233   // vectorization.
5234 
  // Global values, params and instructions outside of the current loop are
  // out of scope.
5237   auto isOutOfScope = [&](Value *V) -> bool {
5238     Instruction *I = dyn_cast<Instruction>(V);
5239     return (!I || !TheLoop->contains(I));
5240   };
5241 
5242   SetVector<Instruction *> Worklist;
5243   BasicBlock *Latch = TheLoop->getLoopLatch();
5244 
5245   // Instructions that are scalar with predication must not be considered
5246   // uniform after vectorization, because that would create an erroneous
5247   // replicating region where only a single instance out of VF should be formed.
  // TODO: optimize such rare cases if found important, see PR40816.
5249   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5250     if (isOutOfScope(I)) {
5251       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5252                         << *I << "\n");
5253       return;
5254     }
5255     if (isScalarWithPredication(I, VF)) {
5256       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5257                         << *I << "\n");
5258       return;
5259     }
5260     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5261     Worklist.insert(I);
5262   };
5263 
5264   // Start with the conditional branch. If the branch condition is an
5265   // instruction contained in the loop that is only used by the branch, it is
5266   // uniform.
5267   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5268   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5269     addToWorklistIfAllowed(Cmp);
5270 
5271   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5272     InstWidening WideningDecision = getWideningDecision(I, VF);
5273     assert(WideningDecision != CM_Unknown &&
5274            "Widening decision should be ready at this moment");
5275 
5276     // A uniform memory op is itself uniform.  We exclude uniform stores
5277     // here as they demand the last lane, not the first one.
5278     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5279       assert(WideningDecision == CM_Scalarize);
5280       return true;
5281     }
5282 
5283     return (WideningDecision == CM_Widen ||
5284             WideningDecision == CM_Widen_Reverse ||
5285             WideningDecision == CM_Interleave);
5286   };
5287 
5288 
5289   // Returns true if Ptr is the pointer operand of a memory access instruction
5290   // I, and I is known to not require scalarization.
5291   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5292     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5293   };
5294 
5295   // Holds a list of values which are known to have at least one uniform use.
  // Note that there may be other uses which aren't uniform. A "uniform use"
  // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (i.e. this is not
  // the usual meaning of uniform).
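  // For example (illustrative): the address of a consecutive widened load is
  // a "uniform use" in this sense; only the lane-0 address is needed, and the
  // remaining lanes are implied by the access being consecutive.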
5300   SmallPtrSet<Value *, 8> HasUniformUse;
5301 
5302   // Scan the loop for instructions which are either a) known to have only
5303   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5304   for (auto *BB : TheLoop->blocks())
5305     for (auto &I : *BB) {
5306       // If there's no pointer operand, there's nothing to do.
5307       auto *Ptr = getLoadStorePointerOperand(&I);
5308       if (!Ptr)
5309         continue;
5310 
5311       // A uniform memory op is itself uniform.  We exclude uniform stores
5312       // here as they demand the last lane, not the first one.
5313       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5314         addToWorklistIfAllowed(&I);
5315 
5316       if (isUniformDecision(&I, VF)) {
5317         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5318         HasUniformUse.insert(Ptr);
5319       }
5320     }
5321 
5322   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5323   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5324   // disallows uses outside the loop as well.
5325   for (auto *V : HasUniformUse) {
5326     if (isOutOfScope(V))
5327       continue;
5328     auto *I = cast<Instruction>(V);
5329     auto UsersAreMemAccesses =
5330       llvm::all_of(I->users(), [&](User *U) -> bool {
5331         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5332       });
5333     if (UsersAreMemAccesses)
5334       addToWorklistIfAllowed(I);
5335   }
5336 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
5340   unsigned idx = 0;
5341   while (idx != Worklist.size()) {
5342     Instruction *I = Worklist[idx++];
5343 
5344     for (auto OV : I->operand_values()) {
5345       // isOutOfScope operands cannot be uniform instructions.
5346       if (isOutOfScope(OV))
5347         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
5350       auto *OP = dyn_cast<PHINode>(OV);
5351       if (OP && Legal->isFirstOrderRecurrence(OP))
5352         continue;
5353       // If all the users of the operand are uniform, then add the
5354       // operand into the uniform worklist.
5355       auto *OI = cast<Instruction>(OV);
5356       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5357             auto *J = cast<Instruction>(U);
5358             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5359           }))
5360         addToWorklistIfAllowed(OI);
5361     }
5362   }
5363 
5364   // For an instruction to be added into Worklist above, all its users inside
5365   // the loop should also be in Worklist. However, this condition cannot be
5366   // true for phi nodes that form a cyclic dependence. We must process phi
5367   // nodes separately. An induction variable will remain uniform if all users
5368   // of the induction variable and induction variable update remain uniform.
5369   // The code below handles both pointer and non-pointer induction variables.
5370   for (auto &Induction : Legal->getInductionVars()) {
5371     auto *Ind = Induction.first;
5372     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5373 
5374     // Determine if all users of the induction variable are uniform after
5375     // vectorization.
5376     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5377       auto *I = cast<Instruction>(U);
5378       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5379              isVectorizedMemAccessUse(I, Ind);
5380     });
5381     if (!UniformInd)
5382       continue;
5383 
5384     // Determine if all users of the induction variable update instruction are
5385     // uniform after vectorization.
5386     auto UniformIndUpdate =
5387         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5388           auto *I = cast<Instruction>(U);
5389           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5390                  isVectorizedMemAccessUse(I, IndUpdate);
5391         });
5392     if (!UniformIndUpdate)
5393       continue;
5394 
5395     // The induction variable and its update instruction will remain uniform.
5396     addToWorklistIfAllowed(Ind);
5397     addToWorklistIfAllowed(IndUpdate);
5398   }
5399 
5400   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5401 }
5402 
5403 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5404   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5405 
5406   if (Legal->getRuntimePointerChecking()->Need) {
5407     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5408         "runtime pointer checks needed. Enable vectorization of this "
5409         "loop with '#pragma clang loop vectorize(enable)' when "
5410         "compiling with -Os/-Oz",
5411         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5412     return true;
5413   }
5414 
5415   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5416     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5417         "runtime SCEV checks needed. Enable vectorization of this "
5418         "loop with '#pragma clang loop vectorize(enable)' when "
5419         "compiling with -Os/-Oz",
5420         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5421     return true;
5422   }
5423 
5424   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5425   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5426     reportVectorizationFailure("Runtime stride check for small trip count",
5427         "runtime stride == 1 checks needed. Enable vectorization of "
5428         "this loop without such check by compiling with -Os/-Oz",
5429         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5430     return true;
5431   }
5432 
5433   return false;
5434 }
5435 
5436 Optional<ElementCount>
5437 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5438   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to
    // be dynamically uniform if the target can skip it.
5441     reportVectorizationFailure(
5442         "Not inserting runtime ptr check for divergent target",
5443         "runtime pointer checks needed. Not enabled for divergent target",
5444         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5445     return None;
5446   }
5447 
5448   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5449   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5450   if (TC == 1) {
5451     reportVectorizationFailure("Single iteration (non) loop",
5452         "loop trip count is one, irrelevant for vectorization",
5453         "SingleIterationLoop", ORE, TheLoop);
5454     return None;
5455   }
5456 
5457   ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5458 
5459   switch (ScalarEpilogueStatus) {
5460   case CM_ScalarEpilogueAllowed:
5461     return MaxVF;
5462   case CM_ScalarEpilogueNotAllowedUsePredicate:
5463     LLVM_FALLTHROUGH;
5464   case CM_ScalarEpilogueNotNeededUsePredicate:
5465     LLVM_DEBUG(
5466         dbgs() << "LV: vector predicate hint/switch found.\n"
5467                << "LV: Not allowing scalar epilogue, creating predicated "
5468                << "vector loop.\n");
5469     break;
5470   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5471     // fallthrough as a special case of OptForSize
5472   case CM_ScalarEpilogueNotAllowedOptSize:
5473     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5474       LLVM_DEBUG(
5475           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5476     else
5477       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5478                         << "count.\n");
5479 
5480     // Bail if runtime checks are required, which are not good when optimising
5481     // for size.
5482     if (runtimeChecksRequired())
5483       return None;
5484     break;
5485   }
5486 
  // Now try to fold the tail by masking.
5488 
5489   // Invalidate interleave groups that require an epilogue if we can't mask
5490   // the interleave-group.
5491   if (!useMaskedInterleavedAccesses(TTI)) {
5492     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5493            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
5496     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5497   }
5498 
5499   assert(!MaxVF.isScalable() &&
5500          "Scalable vectors do not yet support tail folding");
5501   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5502          "MaxVF must be a power of 2");
5503   unsigned MaxVFtimesIC =
5504       UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5505   if (TC > 0 && TC % MaxVFtimesIC == 0) {
5506     // Accept MaxVF if we do not have a tail.
5507     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5508     return MaxVF;
5509   }
5510 
5511   // If we don't know the precise trip count, or if the trip count that we
5512   // found modulo the vectorization factor is not zero, try to fold the tail
5513   // by masking.
5514   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5515   if (Legal->prepareToFoldTailByMasking()) {
5516     FoldTailByMasking = true;
5517     return MaxVF;
5518   }
5519 
5520   // If there was a tail-folding hint/switch, but we can't fold the tail by
5521   // masking, fallback to a vectorization with a scalar epilogue.
5522   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5523     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5524                          "scalar epilogue instead.\n");
5525     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5526     return MaxVF;
5527   }
5528 
5529   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5530     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5531     return None;
5532   }
5533 
5534   if (TC == 0) {
5535     reportVectorizationFailure(
5536         "Unable to calculate the loop count due to complex control flow",
5537         "unable to calculate the loop count due to complex control flow",
5538         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5539     return None;
5540   }
5541 
5542   reportVectorizationFailure(
5543       "Cannot optimize for size and vectorize at the same time.",
5544       "cannot optimize for size and vectorize at the same time. "
5545       "Enable vectorization of this loop with '#pragma clang loop "
5546       "vectorize(enable)' when compiling with -Os/-Oz",
5547       "NoTailLoopWithOptForSize", ORE, TheLoop);
5548   return None;
5549 }
5550 
5551 ElementCount
5552 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5553                                                  ElementCount UserVF) {
5554   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5555   unsigned SmallestType, WidestType;
5556   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5557   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5558 
5559   // Get the maximum safe dependence distance in bits computed by LAA.
5560   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
5562   // dependence distance).
5563   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
5564 
5565   if (UserVF.isNonZero()) {
5566     // For now, don't verify legality of scalable vectors.
5567     // This will be addressed properly in https://reviews.llvm.org/D91718.
5568     if (UserVF.isScalable())
5569       return UserVF;
5570 
5571     // If legally unsafe, clamp the user vectorization factor to a safe value.
5572     unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
5573     if (UserVF.getFixedValue() <= MaxSafeVF)
5574       return UserVF;
5575 
5576     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5577                       << " is unsafe, clamping to max safe VF=" << MaxSafeVF
5578                       << ".\n");
5579     ORE->emit([&]() {
5580       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5581                                         TheLoop->getStartLoc(),
5582                                         TheLoop->getHeader())
5583              << "User-specified vectorization factor "
5584              << ore::NV("UserVectorizationFactor", UserVF)
5585              << " is unsafe, clamping to maximum safe vectorization factor "
5586              << ore::NV("VectorizationFactor", MaxSafeVF);
5587     });
5588     return ElementCount::getFixed(MaxSafeVF);
5589   }
5590 
5591   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
5592 
5593   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
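  // A worked example (illustrative): with 256-bit vector registers and a
  // widest scalar type of 32 bits, MaxVectorSize = PowerOf2Floor(256 / 32)
  // = 8.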
5595   unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
5596 
5597   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5598                     << " / " << WidestType << " bits.\n");
5599   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5600                     << WidestRegister << " bits.\n");
5601 
5602   assert(MaxVectorSize <= WidestRegister &&
5603          "Did not expect to pack so many elements"
5604          " into one vector!");
5605   if (MaxVectorSize == 0) {
5606     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5607     MaxVectorSize = 1;
5608     return ElementCount::getFixed(MaxVectorSize);
5609   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5610              isPowerOf2_32(ConstTripCount)) {
5611     // We need to clamp the VF to be the ConstTripCount. There is no point in
5612     // choosing a higher viable VF as done in the loop below.
5613     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5614                       << ConstTripCount << "\n");
5615     MaxVectorSize = ConstTripCount;
5616     return ElementCount::getFixed(MaxVectorSize);
5617   }
5618 
5619   unsigned MaxVF = MaxVectorSize;
5620   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5621       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5622     // Collect all viable vectorization factors larger than the default MaxVF
5623     // (i.e. MaxVectorSize).
5624     SmallVector<ElementCount, 8> VFs;
5625     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5626     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5627       VFs.push_back(ElementCount::getFixed(VS));
5628 
5629     // For each VF calculate its register usage.
5630     auto RUs = calculateRegisterUsage(VFs);
5631 
5632     // Select the largest VF which doesn't require more registers than existing
5633     // ones.
5634     for (int i = RUs.size() - 1; i >= 0; --i) {
5635       bool Selected = true;
5636       for (auto& pair : RUs[i].MaxLocalUsers) {
5637         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5638         if (pair.second > TargetNumRegisters)
5639           Selected = false;
5640       }
5641       if (Selected) {
5642         MaxVF = VFs[i].getKnownMinValue();
5643         break;
5644       }
5645     }
5646     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5647       if (MaxVF < MinVF) {
5648         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5649                           << ") with target's minimum: " << MinVF << '\n');
5650         MaxVF = MinVF;
5651       }
5652     }
5653   }
5654   return ElementCount::getFixed(MaxVF);
5655 }
5656 
5657 VectorizationFactor
5658 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
5659   // FIXME: This can be fixed for scalable vectors later, because at this stage
5660   // the LoopVectorizer will only consider vectorizing a loop with scalable
5661   // vectors when the loop has a hint to enable vectorization for a given VF.
5662   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
5663 
5664   float Cost = expectedCost(ElementCount::getFixed(1)).first;
5665   const float ScalarCost = Cost;
5666   unsigned Width = 1;
5667   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5668 
5669   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5670   if (ForceVectorization && MaxVF.isVector()) {
5671     // Ignore scalar width, because the user explicitly wants vectorization.
5672     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5673     // evaluation.
5674     Cost = std::numeric_limits<float>::max();
5675   }
5676 
5677   for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
5680     // the vector elements.
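    // A worked example (illustrative): if the scalar iteration costs 8 and
    // the VF=4 vector body costs 20, the per-lane vector cost is 20/4 = 5,
    // which beats the scalar cost of 8, so VF=4 is considered profitable.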
5681     VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
5682     float VectorCost = C.first / (float)i;
5683     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5684                       << " costs: " << (int)VectorCost << ".\n");
5685     if (!C.second && !ForceVectorization) {
5686       LLVM_DEBUG(
5687           dbgs() << "LV: Not considering vector loop of width " << i
5688                  << " because it will not generate any vector instructions.\n");
5689       continue;
5690     }
5691 
5692     // If profitable add it to ProfitableVF list.
5693     if (VectorCost < ScalarCost) {
5694       ProfitableVFs.push_back(VectorizationFactor(
5695           {ElementCount::getFixed(i), (unsigned)VectorCost}));
5696     }
5697 
5698     if (VectorCost < Cost) {
5699       Cost = VectorCost;
5700       Width = i;
5701     }
5702   }
5703 
5704   if (!EnableCondStoresVectorization && NumPredStores) {
5705     reportVectorizationFailure("There are conditional stores.",
5706         "store that is conditionally executed prevents vectorization",
5707         "ConditionalStore", ORE, TheLoop);
5708     Width = 1;
5709     Cost = ScalarCost;
5710   }
5711 
5712   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5713              << "LV: Vectorization seems to be not beneficial, "
5714              << "but was forced by a user.\n");
5715   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5716   VectorizationFactor Factor = {ElementCount::getFixed(Width),
5717                                 (unsigned)(Width * Cost)};
5718   return Factor;
5719 }
5720 
5721 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5722     const Loop &L, ElementCount VF) const {
5723   // Cross iteration phis such as reductions need special handling and are
5724   // currently unsupported.
5725   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5726         return Legal->isFirstOrderRecurrence(&Phi) ||
5727                Legal->isReductionVariable(&Phi);
5728       }))
5729     return false;
5730 
5731   // Phis with uses outside of the loop require special handling and are
5732   // currently unsupported.
5733   for (auto &Entry : Legal->getInductionVars()) {
5734     // Look for uses of the value of the induction at the last iteration.
5735     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5736     for (User *U : PostInc->users())
5737       if (!L.contains(cast<Instruction>(U)))
5738         return false;
5739     // Look for uses of penultimate value of the induction.
5740     for (User *U : Entry.first->users())
5741       if (!L.contains(cast<Instruction>(U)))
5742         return false;
5743   }
5744 
5745   // Induction variables that are widened require special handling that is
5746   // currently not supported.
5747   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5748         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5749                  this->isProfitableToScalarize(Entry.first, VF));
5750       }))
5751     return false;
5752 
5753   return true;
5754 }
5755 
5756 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5757     const ElementCount VF) const {
5758   // FIXME: We need a much better cost-model to take different parameters such
5759   // as register pressure, code size increase and cost of extra branches into
5760   // account. For now we apply a very crude heuristic and only consider loops
5761   // with vectorization factors larger than a certain value.
  // We also consider epilogue vectorization unprofitable for targets that
  // don't consider interleaving beneficial (e.g. MVE).
5764   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5765     return false;
5766   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5767     return true;
5768   return false;
5769 }
5770 
5771 VectorizationFactor
5772 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5773     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5774   VectorizationFactor Result = VectorizationFactor::Disabled();
5775   if (!EnableEpilogueVectorization) {
5776     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5777     return Result;
5778   }
5779 
5780   if (!isScalarEpilogueAllowed()) {
5781     LLVM_DEBUG(
5782         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5783                   "allowed.\n";);
5784     return Result;
5785   }
5786 
5787   // FIXME: This can be fixed for scalable vectors later, because at this stage
5788   // the LoopVectorizer will only consider vectorizing a loop with scalable
5789   // vectors when the loop has a hint to enable vectorization for a given VF.
5790   if (MainLoopVF.isScalable()) {
5791     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
5792                          "yet supported.\n");
5793     return Result;
5794   }
5795 
5796   // Not really a cost consideration, but check for unsupported cases here to
5797   // simplify the logic.
5798   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5799     LLVM_DEBUG(
5800         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5801                   "not a supported candidate.\n";);
5802     return Result;
5803   }
5804 
5805   if (EpilogueVectorizationForceVF > 1) {
5806     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5807     if (LVP.hasPlanWithVFs(
5808             {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
5809       return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
5810     else {
5811       LLVM_DEBUG(
5812           dbgs()
5813               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5814       return Result;
5815     }
5816   }
5817 
5818   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5819       TheLoop->getHeader()->getParent()->hasMinSize()) {
5820     LLVM_DEBUG(
5821         dbgs()
5822             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5823     return Result;
5824   }
5825 
5826   if (!isEpilogueVectorizationProfitable(MainLoopVF))
5827     return Result;
5828 
5829   for (auto &NextVF : ProfitableVFs)
5830     if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
5831         (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
5832         LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
5833       Result = NextVF;
5834 
5835   if (Result != VectorizationFactor::Disabled())
5836     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5837                       << Result.Width.getFixedValue() << "\n";);
5838   return Result;
5839 }
5840 
5841 std::pair<unsigned, unsigned>
5842 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5843   unsigned MinWidth = -1U;
5844   unsigned MaxWidth = 8;
5845   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5846 
5847   // For each block.
5848   for (BasicBlock *BB : TheLoop->blocks()) {
5849     // For each instruction in the loop.
5850     for (Instruction &I : BB->instructionsWithoutDebug()) {
5851       Type *T = I.getType();
5852 
5853       // Skip ignored values.
5854       if (ValuesToIgnore.count(&I))
5855         continue;
5856 
5857       // Only examine Loads, Stores and PHINodes.
5858       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5859         continue;
5860 
5861       // Examine PHI nodes that are reduction variables. Update the type to
5862       // account for the recurrence type.
5863       if (auto *PN = dyn_cast<PHINode>(&I)) {
5864         if (!Legal->isReductionVariable(PN))
5865           continue;
5866         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
5867         T = RdxDesc.getRecurrenceType();
5868       }
5869 
5870       // Examine the stored values.
5871       if (auto *ST = dyn_cast<StoreInst>(&I))
5872         T = ST->getValueOperand()->getType();
5873 
5874       // Ignore loaded pointer types and stored pointer types that are not
5875       // vectorizable.
5876       //
5877       // FIXME: The check here attempts to predict whether a load or store will
5878       //        be vectorized. We only know this for certain after a VF has
5879       //        been selected. Here, we assume that if an access can be
5880       //        vectorized, it will be. We should also look at extending this
5881       //        optimization to non-pointer types.
5882       //
5883       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5884           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5885         continue;
5886 
5887       MinWidth = std::min(MinWidth,
5888                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5889       MaxWidth = std::max(MaxWidth,
5890                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5891     }
5892   }
5893 
5894   return {MinWidth, MaxWidth};
5895 }
5896 
5897 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5898                                                            unsigned LoopCost) {
5899   // -- The interleave heuristics --
5900   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5901   // There are many micro-architectural considerations that we can't predict
5902   // at this level. For example, frontend pressure (on decode or fetch) due to
5903   // code size, or the number and capabilities of the execution ports.
5904   //
5905   // We use the following heuristics to select the interleave count:
5906   // 1. If the code has reductions, then we interleave to break the cross
5907   // iteration dependency.
5908   // 2. If the loop is really small, then we interleave to reduce the loop
5909   // overhead.
5910   // 3. We don't interleave if we think that we will spill registers to memory
5911   // due to the increased register pressure.
5912 
5913   if (!isScalarEpilogueAllowed())
5914     return 1;
5915 
  // The maximum safe dependence distance, if any, was already used to limit
  // the VF; do not interleave in that case.
5917   if (Legal->getMaxSafeDepDistBytes() != -1U)
5918     return 1;
5919 
5920   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5921   const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because under those conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
5927   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5928       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5929     return 1;
5930 
5931   RegisterUsage R = calculateRegisterUsage({VF})[0];
5932   // We divide by these constants so assume that we have at least one
5933   // instruction that uses at least one register.
5934   for (auto& pair : R.MaxLocalUsers) {
5935     pair.second = std::max(pair.second, 1U);
5936   }
5937 
5938   // We calculate the interleave count using the following formula.
5939   // Subtract the number of loop invariants from the number of available
5940   // registers. These registers are used by all of the interleaved instances.
5941   // Next, divide the remaining registers by the number of registers that is
5942   // required by the loop, in order to estimate how many parallel instances
5943   // fit without causing spills. All of this is rounded down if necessary to be
5944   // a power of two. We want power of two interleave count to simplify any
5945   // addressing operations or alignment considerations.
5946   // We also want power of two interleave counts to ensure that the induction
5947   // variable of the vector loop wraps to zero, when tail is folded by masking;
5948   // this currently happens when OptForSize, in which case IC is set to 1 above.
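  // A worked example (illustrative), ignoring the induction-variable
  // adjustment applied below: with 32 registers in a class, 2 loop-invariant
  // values and at most 6 values live at once inside the loop,
  // IC = PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5) = 4.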
5949   unsigned IC = UINT_MAX;
5950 
5951   for (auto& pair : R.MaxLocalUsers) {
5952     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5953     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5954                       << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
5956     if (VF.isScalar()) {
5957       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5958         TargetNumRegisters = ForceTargetNumScalarRegs;
5959     } else {
5960       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5961         TargetNumRegisters = ForceTargetNumVectorRegs;
5962     }
5963     unsigned MaxLocalUsers = pair.second;
5964     unsigned LoopInvariantRegs = 0;
5965     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5966       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5967 
    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) /
                                   MaxLocalUsers);
5969     // Don't count the induction variable as interleaved.
5970     if (EnableIndVarRegisterHeur) {
5971       TmpIC =
5972           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5973                         std::max(1U, (MaxLocalUsers - 1)));
5974     }
5975 
5976     IC = std::min(IC, TmpIC);
5977   }
5978 
5979   // Clamp the interleave ranges to reasonable counts.
5980   unsigned MaxInterleaveCount =
5981       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5982 
5983   // Check if the user has overridden the max.
5984   if (VF.isScalar()) {
5985     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5986       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5987   } else {
5988     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5989       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5990   }
5991 
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count to be at most the trip count divided by VF, provided
  // it is at least 1.
  //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second
  // vector iteration is enabled. However, for larger loops, there is likely
  // to be a similar benefit as for fixed-width vectors. For now, we choose to
  // leave the InterleaveCount as if vscale is '1', although if some
  // information about the vector is known (e.g. min vector size), we can make
  // a better decision.
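  // For example (illustrative values only), a known trip count of 48 with
  // VF = 8 limits MaxInterleaveCount to at most 48 / 8 = 6.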
6002   if (BestKnownTC) {
6003     MaxInterleaveCount =
6004         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6005     // Make sure MaxInterleaveCount is greater than 0.
6006     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6007   }
6008 
6009   assert(MaxInterleaveCount > 0 &&
6010          "Maximum interleave count must be greater than 0");
6011 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
6014   if (IC > MaxInterleaveCount)
6015     IC = MaxInterleaveCount;
6016   else
6017     // Make sure IC is greater than 0.
6018     IC = std::max(1u, IC);
6019 
6020   assert(IC > 0 && "Interleave count must be greater than 0.");
6021 
6022   // If we did not calculate the cost for VF (because the user selected the VF)
6023   // then we calculate the cost of VF here.
6024   if (LoopCost == 0)
6025     LoopCost = expectedCost(VF).first;
6026 
6027   assert(LoopCost && "Non-zero loop cost expected");
6028 
6029   // Interleave if we vectorized this loop and there is a reduction that could
6030   // benefit from interleaving.
6031   if (VF.isVector() && HasReductions) {
6032     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6033     return IC;
6034   }
6035 
6036   // Note that if we've already vectorized the loop we will have done the
6037   // runtime check and so interleaving won't require further checks.
6038   bool InterleavingRequiresRuntimePointerCheck =
6039       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6040 
6041   // We want to interleave small loops in order to reduce the loop overhead and
6042   // potentially expose ILP opportunities.
6043   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6044                     << "LV: IC is " << IC << '\n'
6045                     << "LV: VF is " << VF << '\n');
6046   const bool AggressivelyInterleaveReductions =
6047       TTI.enableAggressiveInterleaving(HasReductions);
6048   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the loop overhead cost is 1, and we use the cost model
    // to estimate the cost of the loop body; we then interleave until the
    // loop overhead is about 5% of the cost of the loop.
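    // For example (illustrative values only), with SmallLoopCost = 20 and
    // LoopCost = 3 this gives SmallIC = min(IC, PowerOf2Floor(20 / 3)), i.e.
    // at most 4.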
6052     unsigned SmallIC =
6053         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6054 
6055     // Interleave until store/load ports (estimated by max interleave count) are
6056     // saturated.
6057     unsigned NumStores = Legal->getNumStores();
6058     unsigned NumLoads = Legal->getNumLoads();
6059     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6060     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
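    // For example (illustrative values only), IC = 8 with two stores and one
    // load gives StoresIC = 4 and LoadsIC = 8; the larger of the two is
    // compared against SmallIC below.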
6061 
    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit, by default, to 2 so
    // that the critical path only gets increased by one reduction operation.
6066     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6067       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6068       SmallIC = std::min(SmallIC, F);
6069       StoresIC = std::min(StoresIC, F);
6070       LoadsIC = std::min(LoadsIC, F);
6071     }
6072 
6073     if (EnableLoadStoreRuntimeInterleave &&
6074         std::max(StoresIC, LoadsIC) > SmallIC) {
6075       LLVM_DEBUG(
6076           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6077       return std::max(StoresIC, LoadsIC);
6078     }
6079 
6080     // If there are scalar reductions and TTI has enabled aggressive
6081     // interleaving for reductions, we will interleave to expose ILP.
6082     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6083         AggressivelyInterleaveReductions) {
6084       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressively as the normal
      // IC to accommodate the rare situation when resources are too limited.
6087       return std::max(IC / 2, SmallIC);
6088     } else {
6089       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6090       return SmallIC;
6091     }
6092   }
6093 
6094   // Interleave if this is a large loop (small loops are already dealt with by
6095   // this point) that could benefit from interleaving.
6096   if (AggressivelyInterleaveReductions) {
6097     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6098     return IC;
6099   }
6100 
6101   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6102   return 1;
6103 }
6104 
6105 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6106 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in a topological order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi-map that
  // holds the list of intervals that *end* at a specific location. This
  // multi-map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it
  // in a set. If we find this value in the multi-map then we remove it from
  // the set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but
  // are used inside the loop. We need this number separately from the
  // max-interval usage number because when we unroll, loop-invariant values
  // do not take more registers.
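  // As an illustration (hypothetical snippet), for a body like
  //   %a = load ...
  //   %b = add %a, %x
  //   store %b, ...
  // the last in-loop use of %a is the add and the last use of %b is the
  // store, so only a small number of values is live at any point, while %x
  // (if defined outside the loop) is recorded separately as a loop invariant.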
6124   LoopBlocksDFS DFS(TheLoop);
6125   DFS.perform(LI);
6126 
6127   RegisterUsage RU;
6128 
6129   // Each 'key' in the map opens a new interval. The values
6130   // of the map are the index of the 'last seen' usage of the
6131   // instruction that is the key.
6132   using IntervalMap = DenseMap<Instruction *, unsigned>;
6133 
6134   // Maps instruction to its index.
6135   SmallVector<Instruction *, 64> IdxToInstr;
6136   // Marks the end of each interval.
6137   IntervalMap EndPoint;
6138   // Saves the list of instruction indices that are used in the loop.
6139   SmallPtrSet<Instruction *, 8> Ends;
6140   // Saves the list of values that are used in the loop but are
6141   // defined outside the loop, such as arguments and constants.
6142   SmallPtrSet<Value *, 8> LoopInvariants;
6143 
6144   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6145     for (Instruction &I : BB->instructionsWithoutDebug()) {
6146       IdxToInstr.push_back(&I);
6147 
6148       // Save the end location of each USE.
6149       for (Value *U : I.operands()) {
6150         auto *Instr = dyn_cast<Instruction>(U);
6151 
6152         // Ignore non-instruction values such as arguments, constants, etc.
6153         if (!Instr)
6154           continue;
6155 
6156         // If this instruction is outside the loop then record it and continue.
6157         if (!TheLoop->contains(Instr)) {
6158           LoopInvariants.insert(Instr);
6159           continue;
6160         }
6161 
6162         // Overwrite previous end points.
6163         EndPoint[Instr] = IdxToInstr.size();
6164         Ends.insert(Instr);
6165       }
6166     }
6167   }
6168 
6169   // Saves the list of intervals that end with the index in 'key'.
6170   using InstrList = SmallVector<Instruction *, 2>;
6171   DenseMap<unsigned, InstrList> TransposeEnds;
6172 
6173   // Transpose the EndPoints to a list of values that end at each index.
6174   for (auto &Interval : EndPoint)
6175     TransposeEnds[Interval.second].push_back(Interval.first);
6176 
6177   SmallPtrSet<Instruction *, 8> OpenIntervals;
6178   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6179   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6180 
6181   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6182 
6183   // A lambda that gets the register usage for the given type and VF.
6184   const auto &TTICapture = TTI;
6185   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
6186     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6187       return 0U;
6188     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6189   };
6190 
6191   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6192     Instruction *I = IdxToInstr[i];
6193 
6194     // Remove all of the instructions that end at this location.
6195     InstrList &List = TransposeEnds[i];
6196     for (Instruction *ToRemove : List)
6197       OpenIntervals.erase(ToRemove);
6198 
6199     // Ignore instructions that are never used within the loop.
6200     if (!Ends.count(I))
6201       continue;
6202 
6203     // Skip ignored values.
6204     if (ValuesToIgnore.count(I))
6205       continue;
6206 
6207     // For each VF find the maximum usage of registers.
6208     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6209       // Count the number of live intervals.
6210       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6211 
6212       if (VFs[j].isScalar()) {
        for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
          RegUsage[ClassID] += 1;
        }
6220       } else {
6221         collectUniformsAndScalars(VFs[j]);
6222         for (auto Inst : OpenIntervals) {
6223           // Skip ignored values for VF > 1.
6224           if (VecValuesToIgnore.count(Inst))
6225             continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
6239         }
6240       }
6241 
      for (auto &pair : RegUsage) {
        auto &Entry = MaxUsages[j][pair.first];
        Entry = std::max(Entry, pair.second);
      }
6248     }
6249 
6250     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6251                       << OpenIntervals.size() << '\n');
6252 
6253     // Add the current instruction to the list of open intervals.
6254     OpenIntervals.insert(I);
6255   }
6256 
6257   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6258     SmallMapVector<unsigned, unsigned, 4> Invariant;
6259 
6260     for (auto Inst : LoopInvariants) {
6261       unsigned Usage =
6262           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6263       unsigned ClassID =
6264           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      Invariant[ClassID] += Usage;
6269     }
6270 
6271     LLVM_DEBUG({
6272       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6273       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6274              << " item\n";
6275       for (const auto &pair : MaxUsages[i]) {
6276         dbgs() << "LV(REG): RegisterClass: "
6277                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6278                << " registers\n";
6279       }
6280       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6281              << " item\n";
6282       for (const auto &pair : Invariant) {
6283         dbgs() << "LV(REG): RegisterClass: "
6284                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6285                << " registers\n";
6286       }
6287     });
6288 
6289     RU.LoopInvariantRegs = Invariant;
6290     RU.MaxLocalUsers = MaxUsages[i];
6291     RUs[i] = RU;
6292   }
6293 
6294   return RUs;
6295 }
6296 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
  // TODO: The cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost
  // model. Masked load/gather emulation was previously never allowed.
  // A limited number of masked store/scatter emulations were allowed.
6306   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
6307   return isa<LoadInst>(I) ||
6308          (isa<StoreInst>(I) &&
6309           NumPredStores > NumberOfStoresToPredicate);
6310 }
6311 
6312 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6313   // If we aren't vectorizing the loop, or if we've already collected the
6314   // instructions to scalarize, there's nothing to do. Collection may already
6315   // have occurred if we have a user-selected VF and are now computing the
6316   // expected cost for interleaving.
6317   if (VF.isScalar() || VF.isZero() ||
6318       InstsToScalarize.find(VF) != InstsToScalarize.end())
6319     return;
6320 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6322   // not profitable to scalarize any instructions, the presence of VF in the
6323   // map will indicate that we've analyzed it already.
6324   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6325 
6326   // Find all the instructions that are scalar with predication in the loop and
6327   // determine if it would be better to not if-convert the blocks they are in.
6328   // If so, we also record the instructions to scalarize.
6329   for (BasicBlock *BB : TheLoop->blocks()) {
6330     if (!blockNeedsPredication(BB))
6331       continue;
6332     for (Instruction &I : *BB)
6333       if (isScalarWithPredication(&I)) {
6334         ScalarCostsTy ScalarCosts;
6335         // Do not apply discount logic if hacked cost is needed
6336         // for emulated masked memrefs.
6337         if (!useEmulatedMaskMemRefHack(&I) &&
6338             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6339           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6340         // Remember that BB will remain after vectorization.
6341         PredicatedBBsAfterVectorization.insert(BB);
6342       }
6343   }
6344 }
6345 
6346 int LoopVectorizationCostModel::computePredInstDiscount(
6347     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
6348     ElementCount VF) {
6349   assert(!isUniformAfterVectorization(PredInst, VF) &&
6350          "Instruction marked uniform-after-vectorization will be predicated");
6351 
6352   // Initialize the discount to zero, meaning that the scalar version and the
6353   // vector version cost the same.
6354   int Discount = 0;
6355 
6356   // Holds instructions to analyze. The instructions we visit are mapped in
6357   // ScalarCosts. Those instructions are the ones that would be scalarized if
6358   // we find that the scalar version costs less.
6359   SmallVector<Instruction *, 8> Worklist;
6360 
6361   // Returns true if the given instruction can be scalarized.
6362   auto canBeScalarized = [&](Instruction *I) -> bool {
6363     // We only attempt to scalarize instructions forming a single-use chain
6364     // from the original predicated block that would otherwise be vectorized.
6365     // Although not strictly necessary, we give up on instructions we know will
6366     // already be scalar to avoid traversing chains that are unlikely to be
6367     // beneficial.
6368     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6369         isScalarAfterVectorization(I, VF))
6370       return false;
6371 
6372     // If the instruction is scalar with predication, it will be analyzed
6373     // separately. We ignore it within the context of PredInst.
6374     if (isScalarWithPredication(I))
6375       return false;
6376 
6377     // If any of the instruction's operands are uniform after vectorization,
6378     // the instruction cannot be scalarized. This prevents, for example, a
6379     // masked load from being scalarized.
6380     //
6381     // We assume we will only emit a value for lane zero of an instruction
6382     // marked uniform after vectorization, rather than VF identical values.
6383     // Thus, if we scalarize an instruction that uses a uniform, we would
6384     // create uses of values corresponding to the lanes we aren't emitting code
6385     // for. This behavior can be changed by allowing getScalarValue to clone
6386     // the lane zero values for uniforms rather than asserting.
6387     for (Use &U : I->operands())
6388       if (auto *J = dyn_cast<Instruction>(U.get()))
6389         if (isUniformAfterVectorization(J, VF))
6390           return false;
6391 
6392     // Otherwise, we can scalarize the instruction.
6393     return true;
6394   };
6395 
6396   // Compute the expected cost discount from scalarizing the entire expression
6397   // feeding the predicated instruction. We currently only consider expressions
6398   // that are single-use instruction chains.
6399   Worklist.push_back(PredInst);
6400   while (!Worklist.empty()) {
6401     Instruction *I = Worklist.pop_back_val();
6402 
6403     // If we've already analyzed the instruction, there's nothing to do.
6404     if (ScalarCosts.find(I) != ScalarCosts.end())
6405       continue;
6406 
6407     // Compute the cost of the vector instruction. Note that this cost already
6408     // includes the scalarization overhead of the predicated instruction.
6409     unsigned VectorCost = getInstructionCost(I, VF).first;
6410 
6411     // Compute the cost of the scalarized instruction. This cost is the cost of
6412     // the instruction as if it wasn't if-converted and instead remained in the
6413     // predicated block. We will scale this cost by block probability after
6414     // computing the scalarization overhead.
6415     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6416     unsigned ScalarCost =
6417         VF.getKnownMinValue() *
6418         getInstructionCost(I, ElementCount::getFixed(1)).first;
6419 
6420     // Compute the scalarization overhead of needed insertelement instructions
6421     // and phi nodes.
6422     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6423       ScalarCost += TTI.getScalarizationOverhead(
6424           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6425           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6426       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6427       ScalarCost +=
6428           VF.getKnownMinValue() *
6429           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6430     }
6431 
6432     // Compute the scalarization overhead of needed extractelement
6433     // instructions. For each of the instruction's operands, if the operand can
6434     // be scalarized, add it to the worklist; otherwise, account for the
6435     // overhead.
6436     for (Use &U : I->operands())
6437       if (auto *J = dyn_cast<Instruction>(U.get())) {
6438         assert(VectorType::isValidElementType(J->getType()) &&
6439                "Instruction has non-scalar type");
6440         if (canBeScalarized(J))
6441           Worklist.push_back(J);
6442         else if (needsExtract(J, VF)) {
6443           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6444           ScalarCost += TTI.getScalarizationOverhead(
6445               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6446               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6447         }
6448       }
6449 
6450     // Scale the total scalar cost by block probability.
6451     ScalarCost /= getReciprocalPredBlockProb();
6452 
6453     // Compute the discount. A non-negative discount means the vector version
6454     // of the instruction costs more, and scalarizing would be beneficial.
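    // For example (illustrative costs only), a vector cost of 6 against a
    // scaled scalar cost of 4 adds 2 to the discount in favor of
    // scalarization.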
6455     Discount += VectorCost - ScalarCost;
6456     ScalarCosts[I] = ScalarCost;
6457   }
6458 
6459   return Discount;
6460 }
6461 
6462 LoopVectorizationCostModel::VectorizationCostTy
6463 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6464   VectorizationCostTy Cost;
6465 
6466   // For each block.
6467   for (BasicBlock *BB : TheLoop->blocks()) {
6468     VectorizationCostTy BlockCost;
6469 
6470     // For each instruction in the old loop.
6471     for (Instruction &I : BB->instructionsWithoutDebug()) {
6472       // Skip ignored values.
6473       if (ValuesToIgnore.count(&I) ||
6474           (VF.isVector() && VecValuesToIgnore.count(&I)))
6475         continue;
6476 
6477       VectorizationCostTy C = getInstructionCost(&I, VF);
6478 
6479       // Check if we should override the cost.
6480       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6481         C.first = ForceTargetInstructionCost;
6482 
6483       BlockCost.first += C.first;
6484       BlockCost.second |= C.second;
6485       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6486                         << " for VF " << VF << " For instruction: " << I
6487                         << '\n');
6488     }
6489 
6490     // If we are vectorizing a predicated block, it will have been
6491     // if-converted. This means that the block's instructions (aside from
6492     // stores and instructions that may divide by zero) will now be
6493     // unconditionally executed. For the scalar case, we may not always execute
6494     // the predicated block, if it is an if-else block. Thus, scale the block's
    // cost by the probability of executing it. blockNeedsPredication from
    // Legal is used so as to not include all blocks in tail-folded loops.
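    // For example (illustrative only), if the reciprocal block probability is
    // 2, a predicated block whose instructions cost 10 contributes only 5 to
    // the scalar loop cost.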
6497     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6498       BlockCost.first /= getReciprocalPredBlockProb();
6499 
6500     Cost.first += BlockCost.first;
6501     Cost.second |= BlockCost.second;
6502   }
6503 
6504   return Cost;
6505 }
6506 
6507 /// Gets Address Access SCEV after verifying that the access pattern
6508 /// is loop invariant except the induction variable dependence.
6509 ///
6510 /// This SCEV can be sent to the Target in order to estimate the address
6511 /// calculation cost.
6512 static const SCEV *getAddressAccessSCEV(
6513               Value *Ptr,
6514               LoopVectorizationLegality *Legal,
6515               PredicatedScalarEvolution &PSE,
6516               const Loop *TheLoop) {
6517 
6518   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6519   if (!Gep)
6520     return nullptr;
6521 
6522   // We are looking for a gep with all loop invariant indices except for one
6523   // which should be an induction variable.
6524   auto SE = PSE.getSE();
6525   unsigned NumOperands = Gep->getNumOperands();
6526   for (unsigned i = 1; i < NumOperands; ++i) {
6527     Value *Opd = Gep->getOperand(i);
6528     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6529         !Legal->isInductionVariable(Opd))
6530       return nullptr;
6531   }
6532 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6534   return PSE.getSCEV(Ptr);
6535 }
6536 
6537 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6538   return Legal->hasStride(I->getOperand(0)) ||
6539          Legal->hasStride(I->getOperand(1));
6540 }
6541 
6542 unsigned
6543 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6544                                                         ElementCount VF) {
6545   assert(VF.isVector() &&
6546          "Scalarization cost of instruction implies vectorization.");
6547   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6548   Type *ValTy = getMemInstValueType(I);
6549   auto SE = PSE.getSE();
6550 
6551   unsigned AS = getLoadStoreAddressSpace(I);
6552   Value *Ptr = getLoadStorePointerOperand(I);
6553   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6554 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6557   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6558 
6559   // Get the cost of the scalar memory instruction and address computation.
6560   unsigned Cost =
6561       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6562 
6563   // Don't pass *I here, since it is scalar but will actually be part of a
6564   // vectorized loop where the user of it is a vectorized instruction.
6565   const Align Alignment = getLoadStoreAlignment(I);
6566   Cost += VF.getKnownMinValue() *
6567           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6568                               AS, TTI::TCK_RecipThroughput);
6569 
6570   // Get the overhead of the extractelement and insertelement instructions
6571   // we might create due to scalarization.
6572   Cost += getScalarizationOverhead(I, VF);
6573 
6574   // If we have a predicated store, it may not be executed for each vector
6575   // lane. Scale the cost by the probability of executing the predicated
6576   // block.
6577   if (isPredicatedInst(I)) {
6578     Cost /= getReciprocalPredBlockProb();
6579 
6580     if (useEmulatedMaskMemRefHack(I))
6581       // Artificially setting to a high enough value to practically disable
6582       // vectorization with such operations.
6583       Cost = 3000000;
6584   }
6585 
6586   return Cost;
6587 }
6588 
6589 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6590                                                              ElementCount VF) {
6591   Type *ValTy = getMemInstValueType(I);
6592   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6593   Value *Ptr = getLoadStorePointerOperand(I);
6594   unsigned AS = getLoadStoreAddressSpace(I);
6595   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6596   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6597 
6598   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6599          "Stride should be 1 or -1 for consecutive memory access");
6600   const Align Alignment = getLoadStoreAlignment(I);
6601   unsigned Cost = 0;
6602   if (Legal->isMaskRequired(I))
6603     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6604                                       CostKind);
6605   else
6606     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6607                                 CostKind, I);
6608 
6609   bool Reverse = ConsecutiveStride < 0;
6610   if (Reverse)
6611     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6612   return Cost;
6613 }
6614 
6615 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6616                                                          ElementCount VF) {
6617   assert(Legal->isUniformMemOp(*I));
6618 
6619   Type *ValTy = getMemInstValueType(I);
6620   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6621   const Align Alignment = getLoadStoreAlignment(I);
6622   unsigned AS = getLoadStoreAddressSpace(I);
6623   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6624   if (isa<LoadInst>(I)) {
6625     return TTI.getAddressComputationCost(ValTy) +
6626            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6627                                CostKind) +
6628            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6629   }
6630   StoreInst *SI = cast<StoreInst>(I);
6631 
6632   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6633   return TTI.getAddressComputationCost(ValTy) +
6634          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6635                              CostKind) +
6636          (isLoopInvariantStoreValue
6637               ? 0
6638               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6639                                        VF.getKnownMinValue() - 1));
6640 }
6641 
6642 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6643                                                           ElementCount VF) {
6644   Type *ValTy = getMemInstValueType(I);
6645   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6646   const Align Alignment = getLoadStoreAlignment(I);
6647   const Value *Ptr = getLoadStorePointerOperand(I);
6648 
6649   return TTI.getAddressComputationCost(VectorTy) +
6650          TTI.getGatherScatterOpCost(
6651              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6652              TargetTransformInfo::TCK_RecipThroughput, I);
6653 }
6654 
6655 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6656                                                             ElementCount VF) {
6657   Type *ValTy = getMemInstValueType(I);
6658   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6659   unsigned AS = getLoadStoreAddressSpace(I);
6660 
6661   auto Group = getInterleavedAccessGroup(I);
6662   assert(Group && "Fail to get an interleaved access group.");
6663 
6664   unsigned InterleaveFactor = Group->getFactor();
6665   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6666   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6667 
6668   // Holds the indices of existing members in an interleaved load group.
6669   // An interleaved store group doesn't need this as it doesn't allow gaps.
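  // For example (hypothetical group), a factor-4 load group with members only
  // at positions 0 and 2 yields Indices = {0, 2}; the gaps at positions 1 and
  // 3 may then require masking below.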
6670   SmallVector<unsigned, 4> Indices;
6671   if (isa<LoadInst>(I)) {
6672     for (unsigned i = 0; i < InterleaveFactor; i++)
6673       if (Group->getMember(i))
6674         Indices.push_back(i);
6675   }
6676 
6677   // Calculate the cost of the whole interleaved group.
6678   bool UseMaskForGaps =
6679       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6680   unsigned Cost = TTI.getInterleavedMemoryOpCost(
6681       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6682       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6683 
6684   if (Group->isReverse()) {
6685     // TODO: Add support for reversed masked interleaved access.
6686     assert(!Legal->isMaskRequired(I) &&
6687            "Reverse masked interleaved access not supported.");
6688     Cost += Group->getNumMembers() *
6689             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6690   }
6691   return Cost;
6692 }
6693 
6694 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6695                                                               ElementCount VF) {
6696   // Calculate scalar cost only. Vectorization cost should be ready at this
6697   // moment.
6698   if (VF.isScalar()) {
6699     Type *ValTy = getMemInstValueType(I);
6700     const Align Alignment = getLoadStoreAlignment(I);
6701     unsigned AS = getLoadStoreAddressSpace(I);
6702 
6703     return TTI.getAddressComputationCost(ValTy) +
6704            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6705                                TTI::TCK_RecipThroughput, I);
6706   }
6707   return getWideningCost(I, VF);
6708 }
6709 
6710 LoopVectorizationCostModel::VectorizationCostTy
6711 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6712                                                ElementCount VF) {
6713   // If we know that this instruction will remain uniform, check the cost of
6714   // the scalar version.
6715   if (isUniformAfterVectorization(I, VF))
6716     VF = ElementCount::getFixed(1);
6717 
6718   if (VF.isVector() && isProfitableToScalarize(I, VF))
6719     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6720 
6721   // Forced scalars do not have any scalarization overhead.
6722   auto ForcedScalar = ForcedScalars.find(VF);
6723   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6724     auto InstSet = ForcedScalar->second;
6725     if (InstSet.count(I))
6726       return VectorizationCostTy(
6727           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6728            VF.getKnownMinValue()),
6729           false);
6730   }
6731 
6732   Type *VectorTy;
6733   unsigned C = getInstructionCost(I, VF, VectorTy);
6734 
6735   bool TypeNotScalarized =
6736       VF.isVector() && VectorTy->isVectorTy() &&
6737       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
6738   return VectorizationCostTy(C, TypeNotScalarized);
6739 }
6740 
6741 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6742                                                               ElementCount VF) {
6743 
6744   assert(!VF.isScalable() &&
6745          "cannot compute scalarization overhead for scalable vectorization");
6746   if (VF.isScalar())
6747     return 0;
6748 
6749   unsigned Cost = 0;
6750   Type *RetTy = ToVectorTy(I->getType(), VF);
6751   if (!RetTy->isVoidTy() &&
6752       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6753     Cost += TTI.getScalarizationOverhead(
6754         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
6755         true, false);
6756 
6757   // Some targets keep addresses scalar.
6758   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6759     return Cost;
6760 
6761   // Some targets support efficient element stores.
6762   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6763     return Cost;
6764 
6765   // Collect operands to consider.
6766   CallInst *CI = dyn_cast<CallInst>(I);
6767   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
6768 
  // Skip operands that do not require extraction/scalarization and do not
  // incur any overhead.
6771   return Cost + TTI.getOperandsScalarizationOverhead(
6772                     filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
6773 }
6774 
6775 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6776   if (VF.isScalar())
6777     return;
6778   NumPredStores = 0;
6779   for (BasicBlock *BB : TheLoop->blocks()) {
6780     // For each instruction in the old loop.
6781     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6783       if (!Ptr)
6784         continue;
6785 
6786       // TODO: We should generate better code and update the cost model for
6787       // predicated uniform stores. Today they are treated as any other
6788       // predicated store (see added test cases in
6789       // invariant-store-vectorization.ll).
6790       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6791         NumPredStores++;
6792 
6793       if (Legal->isUniformMemOp(I)) {
6794         // TODO: Avoid replicating loads and stores instead of
6795         // relying on instcombine to remove them.
6796         // Load: Scalar load + broadcast
6797         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6798         unsigned Cost = getUniformMemOpCost(&I, VF);
6799         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6800         continue;
6801       }
6802 
6803       // We assume that widening is the best solution when possible.
6804       if (memoryInstructionCanBeWidened(&I, VF)) {
6805         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6806         int ConsecutiveStride =
6807                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6808         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6809                "Expected consecutive stride.");
6810         InstWidening Decision =
6811             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6812         setWideningDecision(&I, VF, Decision, Cost);
6813         continue;
6814       }
6815 
6816       // Choose between Interleaving, Gather/Scatter or Scalarization.
6817       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6818       unsigned NumAccesses = 1;
6819       if (isAccessInterleaved(&I)) {
6820         auto Group = getInterleavedAccessGroup(&I);
6821         assert(Group && "Fail to get an interleaved access group.");
6822 
6823         // Make one decision for the whole group.
6824         if (getWideningDecision(&I, VF) != CM_Unknown)
6825           continue;
6826 
6827         NumAccesses = Group->getNumMembers();
6828         if (interleavedAccessCanBeWidened(&I, VF))
6829           InterleaveCost = getInterleaveGroupCost(&I, VF);
6830       }
6831 
6832       unsigned GatherScatterCost =
6833           isLegalGatherOrScatter(&I)
6834               ? getGatherScatterCost(&I, VF) * NumAccesses
6835               : std::numeric_limits<unsigned>::max();
6836 
6837       unsigned ScalarizationCost =
6838           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6839 
6840       // Choose better solution for the current VF,
6841       // write down this decision and use it during vectorization.
6842       unsigned Cost;
6843       InstWidening Decision;
6844       if (InterleaveCost <= GatherScatterCost &&
6845           InterleaveCost < ScalarizationCost) {
6846         Decision = CM_Interleave;
6847         Cost = InterleaveCost;
6848       } else if (GatherScatterCost < ScalarizationCost) {
6849         Decision = CM_GatherScatter;
6850         Cost = GatherScatterCost;
6851       } else {
6852         Decision = CM_Scalarize;
6853         Cost = ScalarizationCost;
6854       }
      // If the instruction belongs to an interleave group, the whole group
6856       // receives the same decision. The whole group receives the cost, but
6857       // the cost will actually be assigned to one instruction.
6858       if (auto Group = getInterleavedAccessGroup(&I))
6859         setWideningDecision(Group, VF, Decision, Cost);
6860       else
6861         setWideningDecision(&I, VF, Decision, Cost);
6862     }
6863   }
6864 
6865   // Make sure that any load of address and any other address computation
6866   // remains scalar unless there is gather/scatter support. This avoids
6867   // inevitable extracts into address registers, and also has the benefit of
6868   // activating LSR more, since that pass can't optimize vectorized
6869   // addresses.
6870   if (TTI.prefersVectorizedAddressing())
6871     return;
6872 
6873   // Start with all scalar pointer uses.
6874   SmallPtrSet<Instruction *, 8> AddrDefs;
6875   for (BasicBlock *BB : TheLoop->blocks())
6876     for (Instruction &I : *BB) {
6877       Instruction *PtrDef =
6878         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6879       if (PtrDef && TheLoop->contains(PtrDef) &&
6880           getWideningDecision(&I, VF) != CM_GatherScatter)
6881         AddrDefs.insert(PtrDef);
6882     }
6883 
6884   // Add all instructions used to generate the addresses.
6885   SmallVector<Instruction *, 4> Worklist;
6886   for (auto *I : AddrDefs)
6887     Worklist.push_back(I);
6888   while (!Worklist.empty()) {
6889     Instruction *I = Worklist.pop_back_val();
6890     for (auto &Op : I->operands())
6891       if (auto *InstOp = dyn_cast<Instruction>(Op))
6892         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6893             AddrDefs.insert(InstOp).second)
6894           Worklist.push_back(InstOp);
6895   }
6896 
6897   for (auto *I : AddrDefs) {
6898     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
6901       // if the loaded register is involved in an address computation, it is
6902       // instead changed here when we know this is the case.
6903       InstWidening Decision = getWideningDecision(I, VF);
6904       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6905         // Scalarize a widened load of address.
6906         setWideningDecision(
6907             I, VF, CM_Scalarize,
6908             (VF.getKnownMinValue() *
6909              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6910       else if (auto Group = getInterleavedAccessGroup(I)) {
6911         // Scalarize an interleave group of address loads.
6912         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6913           if (Instruction *Member = Group->getMember(I))
6914             setWideningDecision(
6915                 Member, VF, CM_Scalarize,
6916                 (VF.getKnownMinValue() *
6917                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6918         }
6919       }
6920     } else
      // Make sure I gets scalarized and receives a cost estimate without
      // scalarization overhead.
6923       ForcedScalars[VF].insert(I);
6924   }
6925 }
6926 
6927 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6928                                                         ElementCount VF,
6929                                                         Type *&VectorTy) {
6930   Type *RetTy = I->getType();
6931   if (canTruncateToMinimalBitwidth(I, VF))
6932     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6933   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6934   auto SE = PSE.getSE();
6935   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6936 
6937   // TODO: We need to estimate the cost of intrinsic calls.
6938   switch (I->getOpcode()) {
6939   case Instruction::GetElementPtr:
6940     // We mark this instruction as zero-cost because the cost of GEPs in
6941     // vectorized code depends on whether the corresponding memory instruction
6942     // is scalarized or not. Therefore, we handle GEPs with the memory
6943     // instruction cost.
6944     return 0;
6945   case Instruction::Br: {
6946     // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
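    // For example (illustrative only), with VF = 4 this amounts to the
    // overhead of extracting four i1 lanes plus the cost of four scalar
    // branches.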
6949     bool ScalarPredicatedBB = false;
6950     BranchInst *BI = cast<BranchInst>(I);
6951     if (VF.isVector() && BI->isConditional() &&
6952         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6953          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6954       ScalarPredicatedBB = true;
6955 
6956     if (ScalarPredicatedBB) {
6957       // Return cost for branches around scalarized and predicated blocks.
6958       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6959       auto *Vec_i1Ty =
6960           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6961       return (TTI.getScalarizationOverhead(
6962                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
6963                   false, true) +
6964               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
6965                VF.getKnownMinValue()));
6966     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6967       // The back-edge branch will remain, as will all scalar branches.
6968       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6969     else
6970       // This branch will be eliminated by if-conversion.
6971       return 0;
6972     // Note: We currently assume zero cost for an unconditional branch inside
6973     // a predicated block since it will become a fall-through, although we
6974     // may decide in the future to call TTI for all branches.
6975   }
6976   case Instruction::PHI: {
6977     auto *Phi = cast<PHINode>(I);
6978 
6979     // First-order recurrences are replaced by vector shuffles inside the loop.
6980     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6981     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
6982       return TTI.getShuffleCost(
6983           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
6984           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
6985 
6986     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6987     // converted into select instructions. We require N - 1 selects per phi
6988     // node, where N is the number of incoming values.
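    // For example, a phi with three incoming values is lowered to two
    // selects.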
6989     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6990       return (Phi->getNumIncomingValues() - 1) *
6991              TTI.getCmpSelInstrCost(
6992                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6993                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6994                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6995 
6996     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6997   }
6998   case Instruction::UDiv:
6999   case Instruction::SDiv:
7000   case Instruction::URem:
7001   case Instruction::SRem:
7002     // If we have a predicated instruction, it may not be executed for each
7003     // vector lane. Get the scalarization cost and scale this amount by the
7004     // probability of executing the predicated block. If the instruction is not
7005     // predicated, we fall through to the next case.
7006     if (VF.isVector() && isScalarWithPredication(I)) {
7007       unsigned Cost = 0;
7008 
7009       // These instructions have a non-void type, so account for the phi nodes
7010       // that we will create. This cost is likely to be zero. The phi node
7011       // cost, if any, should be scaled by the block probability because it
7012       // models a copy at the end of each predicated block.
7013       Cost += VF.getKnownMinValue() *
7014               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7015 
7016       // The cost of the non-predicated instruction.
7017       Cost += VF.getKnownMinValue() *
7018               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7019 
7020       // The cost of insertelement and extractelement instructions needed for
7021       // scalarization.
7022       Cost += getScalarizationOverhead(I, VF);
7023 
7024       // Scale the cost by the probability of executing the predicated blocks.
7025       // This assumes the predicated block for each vector lane is equally
7026       // likely.
7027       return Cost / getReciprocalPredBlockProb();
7028     }
7029     LLVM_FALLTHROUGH;
7030   case Instruction::Add:
7031   case Instruction::FAdd:
7032   case Instruction::Sub:
7033   case Instruction::FSub:
7034   case Instruction::Mul:
7035   case Instruction::FMul:
7036   case Instruction::FDiv:
7037   case Instruction::FRem:
7038   case Instruction::Shl:
7039   case Instruction::LShr:
7040   case Instruction::AShr:
7041   case Instruction::And:
7042   case Instruction::Or:
7043   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
7045     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7046       return 0;
7047     // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
7049     Value *Op2 = I->getOperand(1);
7050     TargetTransformInfo::OperandValueProperties Op2VP;
7051     TargetTransformInfo::OperandValueKind Op2VK =
7052         TTI.getOperandInfo(Op2, Op2VP);
7053     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7054       Op2VK = TargetTransformInfo::OK_UniformValue;
7055 
7056     SmallVector<const Value *, 4> Operands(I->operand_values());
7057     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7058     return N * TTI.getArithmeticInstrCost(
7059                    I->getOpcode(), VectorTy, CostKind,
7060                    TargetTransformInfo::OK_AnyValue,
7061                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7062   }
7063   case Instruction::FNeg: {
7064     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7065     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7066     return N * TTI.getArithmeticInstrCost(
7067                    I->getOpcode(), VectorTy, CostKind,
7068                    TargetTransformInfo::OK_AnyValue,
7069                    TargetTransformInfo::OK_AnyValue,
7070                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
7071                    I->getOperand(0), I);
7072   }
7073   case Instruction::Select: {
7074     SelectInst *SI = cast<SelectInst>(I);
7075     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7076     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7077     Type *CondTy = SI->getCondition()->getType();
7078     if (!ScalarCond) {
7079       assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7080       CondTy = VectorType::get(CondTy, VF);
7081     }
7082     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7083                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7084   }
7085   case Instruction::ICmp:
7086   case Instruction::FCmp: {
7087     Type *ValTy = I->getOperand(0)->getType();
7088     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7089     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7090       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7091     VectorTy = ToVectorTy(ValTy, VF);
7092     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7093                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7094   }
7095   case Instruction::Store:
7096   case Instruction::Load: {
7097     ElementCount Width = VF;
7098     if (Width.isVector()) {
7099       InstWidening Decision = getWideningDecision(I, Width);
7100       assert(Decision != CM_Unknown &&
7101              "CM decision should be taken at this point");
7102       if (Decision == CM_Scalarize)
7103         Width = ElementCount::getFixed(1);
7104     }
7105     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
7106     return getMemoryInstructionCost(I, VF);
7107   }
7108   case Instruction::ZExt:
7109   case Instruction::SExt:
7110   case Instruction::FPToUI:
7111   case Instruction::FPToSI:
7112   case Instruction::FPExt:
7113   case Instruction::PtrToInt:
7114   case Instruction::IntToPtr:
7115   case Instruction::SIToFP:
7116   case Instruction::UIToFP:
7117   case Instruction::Trunc:
7118   case Instruction::FPTrunc:
7119   case Instruction::BitCast: {
7120     // Computes the CastContextHint from a Load/Store instruction.
7121     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7122       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7123              "Expected a load or a store!");
7124 
7125       if (VF.isScalar() || !TheLoop->contains(I))
7126         return TTI::CastContextHint::Normal;
7127 
7128       switch (getWideningDecision(I, VF)) {
7129       case LoopVectorizationCostModel::CM_GatherScatter:
7130         return TTI::CastContextHint::GatherScatter;
7131       case LoopVectorizationCostModel::CM_Interleave:
7132         return TTI::CastContextHint::Interleave;
7133       case LoopVectorizationCostModel::CM_Scalarize:
7134       case LoopVectorizationCostModel::CM_Widen:
7135         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7136                                         : TTI::CastContextHint::Normal;
7137       case LoopVectorizationCostModel::CM_Widen_Reverse:
7138         return TTI::CastContextHint::Reversed;
7139       case LoopVectorizationCostModel::CM_Unknown:
7140         llvm_unreachable("Instr did not go through cost modelling?");
7141       }
7142 
7143       llvm_unreachable("Unhandled case!");
7144     };
7145 
7146     unsigned Opcode = I->getOpcode();
7147     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7148     // For Trunc, the context is the only user, which must be a StoreInst.
7149     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7150       if (I->hasOneUse())
7151         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7152           CCH = ComputeCCH(Store);
7153     }
7154     // For Z/Sext, the context is the operand, which must be a LoadInst.
7155     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7156              Opcode == Instruction::FPExt) {
7157       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7158         CCH = ComputeCCH(Load);
7159     }
7160 
7161     // We optimize the truncation of induction variables having constant
7162     // integer steps. The cost of these truncations is the same as the scalar
7163     // operation.
7164     if (isOptimizableIVTruncate(I, VF)) {
7165       auto *Trunc = cast<TruncInst>(I);
7166       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7167                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7168     }
7169 
7170     Type *SrcScalarTy = I->getOperand(0)->getType();
7171     Type *SrcVecTy =
7172         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7173     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
7176       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7177       //
7178       // Calculate the modified src and dest types.
7179       Type *MinVecTy = VectorTy;
7180       if (Opcode == Instruction::Trunc) {
7181         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7182         VectorTy =
7183             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7184       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7185         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7186         VectorTy =
7187             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7188       }
7189     }
7190 
7191     assert(!VF.isScalable() && "VF is assumed to be non scalable");
7192     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7193     return N *
7194            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7195   }
7196   case Instruction::Call: {
7197     bool NeedToScalarize;
7198     CallInst *CI = cast<CallInst>(I);
7199     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7200     if (getVectorIntrinsicIDForCall(CI, TLI))
7201       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
7202     return CallCost;
7203   }
7204   case Instruction::ExtractValue: {
7205     InstructionCost ExtractCost =
7206         TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7207     assert(ExtractCost.isValid() && "Invalid cost for ExtractValue");
7208     return *(ExtractCost.getValue());
7209   }
7210   default:
7211     // This opcode is unknown. Assume that the cost of executing VF copies of
7212     // the scalar instruction is the same as that of a 'mul'.
7213     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
7214                                        Instruction::Mul, VectorTy, CostKind) +
7215            getScalarizationOverhead(I, VF);
7216   } // end of switch.
7217 }
7218 
7219 char LoopVectorize::ID = 0;
7220 
7221 static const char lv_name[] = "Loop Vectorization";
7222 
7223 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7224 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7225 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7226 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7227 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7228 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7229 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7230 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7231 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7232 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7233 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7234 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7235 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7236 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7237 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7238 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7239 
7240 namespace llvm {
7241 
7242 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7243 
7244 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7245                               bool VectorizeOnlyWhenForced) {
7246   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7247 }
7248 
7249 } // end namespace llvm
7250 
7251 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7252   // Check if the pointer operand of a load or store instruction is
7253   // consecutive.
7254   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7255     return Legal->isConsecutivePtr(Ptr);
7256   return false;
7257 }
7258 
7259 void LoopVectorizationCostModel::collectValuesToIgnore() {
7260   // Ignore ephemeral values.
7261   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7262 
7263   // Ignore type-promoting instructions we identified during reduction
7264   // detection.
7265   for (auto &Reduction : Legal->getReductionVars()) {
7266     RecurrenceDescriptor &RedDes = Reduction.second;
7267     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7268     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7269   }
7270   // Ignore type-casting instructions we identified during induction
7271   // detection.
7272   for (auto &Induction : Legal->getInductionVars()) {
7273     InductionDescriptor &IndDes = Induction.second;
7274     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7275     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7276   }
7277 }
7278 
7279 void LoopVectorizationCostModel::collectInLoopReductions() {
7280   for (auto &Reduction : Legal->getReductionVars()) {
7281     PHINode *Phi = Reduction.first;
7282     RecurrenceDescriptor &RdxDesc = Reduction.second;
7283 
7284     // We don't collect reductions that are type promoted (yet).
7285     if (RdxDesc.getRecurrenceType() != Phi->getType())
7286       continue;
7287 
7288     // If the target would prefer this reduction to happen "in-loop", then we
7289     // want to record it as such.
7290     unsigned Opcode = RdxDesc.getRecurrenceBinOp();
7291     if (!PreferInLoopReductions &&
7292         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7293                                    TargetTransformInfo::ReductionFlags()))
7294       continue;
7295 
7296     // Check that we can correctly put the reductions into the loop, by
7297     // finding the chain of operations that leads from the phi to the loop
7298     // exit value.
7299     SmallVector<Instruction *, 4> ReductionOperations =
7300         RdxDesc.getReductionOpChain(Phi, TheLoop);
7301     bool InLoop = !ReductionOperations.empty();
7302     if (InLoop)
7303       InLoopReductionChains[Phi] = ReductionOperations;
7304     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7305                       << " reduction for phi: " << *Phi << "\n");
7306   }
7307 }
7308 
7309 // TODO: we could return a pair of values that specify the max VF and
7310 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7311 // `buildVPlans(VF, VF)`. We cannot do that yet because VPlan does not have a
7312 // cost model that can choose which plan to execute if more than one is
7313 // generated.
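// For example (illustrative), with 256-bit wide vector registers and a widest
// scalar type of 32 bits, the VF determined below is 256 / 32 = 8.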
7314 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7315                                  LoopVectorizationCostModel &CM) {
7316   unsigned WidestType;
7317   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7318   return WidestVectorRegBits / WidestType;
7319 }
7320 
7321 VectorizationFactor
7322 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7323   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7324   ElementCount VF = UserVF;
7325   // Outer loop handling: outer loops may require CFG and instruction-level
7326   // transformations before we can even evaluate whether vectorization is
7327   // profitable. Since we cannot modify the incoming IR, we need to build VPlan
7328   // upfront in the vectorization pipeline.
7329   if (!OrigLoop->isInnermost()) {
7330     // If the user doesn't provide a vectorization factor, determine a
7331     // reasonable one.
7332     if (UserVF.isZero()) {
7333       VF = ElementCount::getFixed(
7334           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7335       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7336 
7337       // Make sure we have a VF > 1 for stress testing.
7338       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7339         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7340                           << "overriding computed VF.\n");
7341         VF = ElementCount::getFixed(4);
7342       }
7343     }
7344     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7345     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7346            "VF needs to be a power of two");
7347     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7348                       << "VF " << VF << " to build VPlans.\n");
7349     buildVPlans(VF, VF);
7350 
7351     // For VPlan build stress testing, we bail out after VPlan construction.
7352     if (VPlanBuildStressTest)
7353       return VectorizationFactor::Disabled();
7354 
7355     return {VF, 0 /*Cost*/};
7356   }
7357 
7358   LLVM_DEBUG(
7359       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7360                 "VPlan-native path.\n");
7361   return VectorizationFactor::Disabled();
7362 }
7363 
7364 Optional<VectorizationFactor>
7365 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7366   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7367   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
7368   if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
7369     return None;
7370 
7371   // Invalidate interleave groups if all blocks of the loop will be predicated.
7372   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
7373       !useMaskedInterleavedAccesses(*TTI)) {
7374     LLVM_DEBUG(
7375         dbgs()
7376         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7377            "which requires masked-interleaved support.\n");
7378     if (CM.InterleaveInfo.invalidateGroups())
7379       // Invalidating interleave groups also requires invalidating all decisions
7380       // based on them, which includes widening decisions and uniform and scalar
7381       // values.
7382       CM.invalidateCostModelingDecisions();
7383   }
7384 
7385   ElementCount MaxVF = MaybeMaxVF.getValue();
7386   assert(MaxVF.isNonZero() && "MaxVF is zero.");
7387 
7388   if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) {
7389     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7390     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7391            "VF needs to be a power of two");
7392     // Collect the instructions (and their associated costs) that will be more
7393     // profitable to scalarize.
7394     CM.selectUserVectorizationFactor(UserVF);
7395     CM.collectInLoopReductions();
7396     buildVPlansWithVPRecipes(UserVF, UserVF);
7397     LLVM_DEBUG(printPlans(dbgs()));
7398     return {{UserVF, 0}};
7399   }
7400 
7401   assert(!MaxVF.isScalable() &&
7402          "Scalable vectors not yet supported beyond this point");
7403 
7404   for (ElementCount VF = ElementCount::getFixed(1);
7405        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
7406     // Collect Uniform and Scalar instructions after vectorization with VF.
7407     CM.collectUniformsAndScalars(VF);
7408 
7409     // Collect the instructions (and their associated costs) that will be more
7410     // profitable to scalarize.
7411     if (VF.isVector())
7412       CM.collectInstsToScalarize(VF);
7413   }
7414 
7415   CM.collectInLoopReductions();
7416 
7417   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
7418   LLVM_DEBUG(printPlans(dbgs()));
7419   if (MaxVF.isScalar())
7420     return VectorizationFactor::Disabled();
7421 
7422   // Select the optimal vectorization factor.
7423   return CM.selectVectorizationFactor(MaxVF);
7424 }
7425 
7426 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7427   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7428                     << '\n');
7429   BestVF = VF;
7430   BestUF = UF;
7431 
7432   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7433     return !Plan->hasVF(VF);
7434   });
7435   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7436 }
7437 
7438 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7439                                            DominatorTree *DT) {
7440   // Perform the actual loop transformation.
7441 
7442   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7443   VPCallbackILV CallbackILV(ILV);
7444 
7445   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7446 
7447   VPTransformState State{*BestVF, BestUF,      LI,
7448                          DT,      ILV.Builder, ILV.VectorLoopValueMap,
7449                          &ILV,    CallbackILV};
7450   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7451   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7452   State.CanonicalIV = ILV.Induction;
7453 
7454   ILV.printDebugTracesAtStart();
7455 
7456   //===------------------------------------------------===//
7457   //
7458   // Notice: any optimization or new instruction that goes
7459   // into the code below should also be implemented in
7460   // the cost model.
7461   //
7462   //===------------------------------------------------===//
7463 
7464   // 2. Copy and widen instructions from the old loop into the new loop.
7465   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7466   VPlans.front()->execute(&State);
7467 
7468   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7469   //    predication, updating analyses.
7470   ILV.fixVectorizedLoop();
7471 
7472   ILV.printDebugTracesAtEnd();
7473 }
7474 
7475 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7476     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7477 
7478   // We create new control flow for the vectorized loop, so the original exit
7479   // conditions will be dead after vectorization if they are only used by their
7480   // terminators.
7481   SmallVector<BasicBlock*> ExitingBlocks;
7482   OrigLoop->getExitingBlocks(ExitingBlocks);
7483   for (auto *BB : ExitingBlocks) {
7484     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7485     if (!Cmp || !Cmp->hasOneUse())
7486       continue;
7487 
7488     // TODO: we should introduce a getUniqueExitingBlocks on Loop
7489     if (!DeadInstructions.insert(Cmp).second)
7490       continue;
7491 
7492     // The operands of the icmp are often dead truncs, used by IndUpdate.
7493     // TODO: we could recurse through the operands in general.
7494     for (Value *Op : Cmp->operands()) {
7495       if (isa<TruncInst>(Op) && Op->hasOneUse())
7496           DeadInstructions.insert(cast<Instruction>(Op));
7497     }
7498   }
7499 
7500   // We create new "steps" for induction variable updates to which the original
7501   // induction variables map. An original update instruction will be dead if
7502   // all its users except the induction variable are dead.
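  // For example (illustrative), an update such as "%iv.next = add nuw i64 %iv, 1"
  // whose only remaining user is the %iv phi itself is recorded as dead here.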
7503   auto *Latch = OrigLoop->getLoopLatch();
7504   for (auto &Induction : Legal->getInductionVars()) {
7505     PHINode *Ind = Induction.first;
7506     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7507 
7508     // If the tail is to be folded by masking, the primary induction variable,
7509     // if it exists, isn't dead: it will be used for masking. Don't kill it.
7510     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7511       continue;
7512 
7513     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7514           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7515         }))
7516       DeadInstructions.insert(IndUpdate);
7517 
7518     // We also record as "Dead" the type-casting instructions we had identified
7519     // during induction analysis. We don't need any handling for them in the
7520     // vectorized loop because we have proven that, under a proper runtime
7521     // test guarding the vectorized loop, the value of the phi, and the casted
7522     // value of the phi, are the same. The last instruction in this casting chain
7523     // will get its scalar/vector/widened def from the scalar/vector/widened def
7524     // of the respective phi node. Any other casts in the induction def-use chain
7525     // have no other uses outside the phi update chain, and will be ignored.
7526     InductionDescriptor &IndDes = Induction.second;
7527     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7528     DeadInstructions.insert(Casts.begin(), Casts.end());
7529   }
7530 }
7531 
7532 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7533 
7534 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7535 
7536 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7537                                         Instruction::BinaryOps BinOp) {
7538   // When unrolling and the VF is 1, we only need to add a simple scalar.
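  // For example (illustrative), for an integer Val %v with StartIdx == 3 and
  // step %s, this emits: %induction = add i64 %v, (mul i64 3, %s).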
7539   Type *Ty = Val->getType();
7540   assert(!Ty->isVectorTy() && "Val must be a scalar");
7541 
7542   if (Ty->isFloatingPointTy()) {
7543     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7544 
7545     // Floating point operations had to be 'fast' to enable the unrolling.
7546     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
7547     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
7548   }
7549   Constant *C = ConstantInt::get(Ty, StartIdx);
7550   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7551 }
7552 
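// Attach "llvm.loop.unroll.runtime.disable" to the loop's !llvm.loop metadata,
// unless unroll-disable metadata is already present. A sketch of the resulting
// metadata shape (for exposition only):
//   !0 = distinct !{!0, ..., !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}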
7553 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7554   SmallVector<Metadata *, 4> MDs;
7555   // Reserve first location for self reference to the LoopID metadata node.
7556   MDs.push_back(nullptr);
7557   bool IsUnrollMetadata = false;
7558   MDNode *LoopID = L->getLoopID();
7559   if (LoopID) {
7560     // First find existing loop unrolling disable metadata.
7561     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7562       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7563       if (MD) {
7564         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7565         IsUnrollMetadata =
7566             S && S->getString().startswith("llvm.loop.unroll.disable");
7567       }
7568       MDs.push_back(LoopID->getOperand(i));
7569     }
7570   }
7571 
7572   if (!IsUnrollMetadata) {
7573     // Add runtime unroll disable metadata.
7574     LLVMContext &Context = L->getHeader()->getContext();
7575     SmallVector<Metadata *, 1> DisableOperands;
7576     DisableOperands.push_back(
7577         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7578     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7579     MDs.push_back(DisableNode);
7580     MDNode *NewLoopID = MDNode::get(Context, MDs);
7581     // Set operand 0 to refer to the loop id itself.
7582     NewLoopID->replaceOperandWith(0, NewLoopID);
7583     L->setLoopID(NewLoopID);
7584   }
7585 }
7586 
7587 //===--------------------------------------------------------------------===//
7588 // EpilogueVectorizerMainLoop
7589 //===--------------------------------------------------------------------===//
7590 
7591 /// This function is partially responsible for generating the control flow
7592 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
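/// A rough sketch of the blocks emitted below (simplified; see the link above
/// for the full picture): the epilogue minimum-iteration check ("iter.check")
/// is generated first, followed by the optional SCEV and memory runtime checks,
/// and finally the main-loop minimum-iteration check
/// ("vector.main.loop.iter.check").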
7593 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7594   MDNode *OrigLoopID = OrigLoop->getLoopID();
7595   Loop *Lp = createVectorLoopSkeleton("");
7596 
7597   // Generate the code to check the minimum iteration count of the vector
7598   // epilogue (see below).
7599   EPI.EpilogueIterationCountCheck =
7600       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
7601   EPI.EpilogueIterationCountCheck->setName("iter.check");
7602 
7603   // Generate the code to check any assumptions that we've made for SCEV
7604   // expressions.
7605   BasicBlock *SavedPreHeader = LoopVectorPreHeader;
7606   emitSCEVChecks(Lp, LoopScalarPreHeader);
7607 
7608   // If a safety check was generated, save it.
7609   if (SavedPreHeader != LoopVectorPreHeader)
7610     EPI.SCEVSafetyCheck = SavedPreHeader;
7611 
7612   // Generate the code that checks at runtime if arrays overlap. We put the
7613   // checks into a separate block to make the more common case of few elements
7614   // faster.
7615   SavedPreHeader = LoopVectorPreHeader;
7616   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
7617 
7618   // If a safety check was generated, save/overwrite it.
7619   if (SavedPreHeader != LoopVectorPreHeader)
7620     EPI.MemSafetyCheck = SavedPreHeader;
7621 
7622   // Generate the iteration count check for the main loop, *after* the check
7623   // for the epilogue loop, so that the path length is shorter for the case
7624   // that goes directly through the vector epilogue. The longer path length for
7625   // the main loop is compensated for by the gain from vectorizing the larger
7626   // trip count. Note: the branch will get updated later on when we vectorize
7627   // the epilogue.
7628   EPI.MainLoopIterationCountCheck =
7629       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
7630 
7631   // Generate the induction variable.
7632   OldInduction = Legal->getPrimaryInduction();
7633   Type *IdxTy = Legal->getWidestInductionType();
7634   Value *StartIdx = ConstantInt::get(IdxTy, 0);
7635   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7636   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7637   EPI.VectorTripCount = CountRoundDown;
7638   Induction =
7639       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7640                               getDebugLocFromInstOrOperands(OldInduction));
7641 
7642   // Skip creating induction resume values here; they will be created in the
7643   // second pass. If we created them here, they wouldn't be used anyway,
7644   // because the VPlan in the second pass still contains the inductions from the
7645   // original loop.
7646 
7647   return completeLoopSkeleton(Lp, OrigLoopID);
7648 }
7649 
7650 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7651   LLVM_DEBUG({
7652     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7653            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7654            << ", Main Loop UF:" << EPI.MainLoopUF
7655            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7656            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7657   });
7658 }
7659 
7660 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7661   DEBUG_WITH_TYPE(VerboseDebug, {
7662     dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
7663   });
7664 }
7665 
7666 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
7667     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
7668   assert(L && "Expected valid Loop.");
7669   assert(Bypass && "Expected valid bypass basic block.");
7670   unsigned VFactor =
7671       ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
7672   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7673   Value *Count = getOrCreateTripCount(L);
7674   // Reuse existing vector loop preheader for TC checks.
7675   // Note that new preheader block is generated for vector loop.
7676   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7677   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7678 
7679   // Generate code to check if the loop's trip count is less than VF * UF of
7680   // the chosen (main or epilogue) vector loop.
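  // For example (illustrative), with VFactor == 8 and UFactor == 2 and no
  // required scalar epilogue, this emits:
  //   %min.iters.check = icmp ult i64 %count, 16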
7681   auto P =
7682       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7683 
7684   Value *CheckMinIters = Builder.CreateICmp(
7685       P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
7686       "min.iters.check");
7687 
7688   if (!ForEpilogue)
7689     TCCheckBlock->setName("vector.main.loop.iter.check");
7690 
7691   // Create new preheader for vector loop.
7692   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7693                                    DT, LI, nullptr, "vector.ph");
7694 
7695   if (ForEpilogue) {
7696     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7697                                  DT->getNode(Bypass)->getIDom()) &&
7698            "TC check is expected to dominate Bypass");
7699 
7700     // Update dominator for Bypass & LoopExit.
7701     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7702     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7703 
7704     LoopBypassBlocks.push_back(TCCheckBlock);
7705 
7706     // Save the trip count so we don't have to regenerate it in the
7707     // vec.epilog.iter.check. This is safe to do because the trip count
7708     // generated here dominates the vector epilog iter check.
7709     EPI.TripCount = Count;
7710   }
7711 
7712   ReplaceInstWithInst(
7713       TCCheckBlock->getTerminator(),
7714       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7715 
7716   return TCCheckBlock;
7717 }
7718 
7719 //===--------------------------------------------------------------------===//
7720 // EpilogueVectorizerEpilogueLoop
7721 //===--------------------------------------------------------------------===//
7722 
7723 /// This function is partially responsible for generating the control flow
7724 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7725 BasicBlock *
7726 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7727   MDNode *OrigLoopID = OrigLoop->getLoopID();
7728   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
7729 
7730   // Now, compare the remaining count: if there aren't enough iterations to
7731   // execute the vectorized epilogue, skip to the scalar part.
7732   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7733   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7734   LoopVectorPreHeader =
7735       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7736                  LI, nullptr, "vec.epilog.ph");
7737   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
7738                                           VecEpilogueIterationCountCheck);
7739 
7740   // Adjust the control flow taking the state info from the main loop
7741   // vectorization into account.
7742   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7743          "expected this to be saved from the previous pass.");
7744   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7745       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7746 
7747   DT->changeImmediateDominator(LoopVectorPreHeader,
7748                                EPI.MainLoopIterationCountCheck);
7749 
7750   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7751       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7752 
7753   if (EPI.SCEVSafetyCheck)
7754     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7755         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7756   if (EPI.MemSafetyCheck)
7757     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7758         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7759 
7760   DT->changeImmediateDominator(
7761       VecEpilogueIterationCountCheck,
7762       VecEpilogueIterationCountCheck->getSinglePredecessor());
7763 
7764   DT->changeImmediateDominator(LoopScalarPreHeader,
7765                                EPI.EpilogueIterationCountCheck);
7766   DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
7767 
7768   // Keep track of bypass blocks, as they feed start values to the induction
7769   // phis in the scalar loop preheader.
7770   if (EPI.SCEVSafetyCheck)
7771     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7772   if (EPI.MemSafetyCheck)
7773     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7774   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7775 
7776   // Generate a resume induction for the vector epilogue and put it in the
7777   // vector epilogue preheader.
7778   Type *IdxTy = Legal->getWidestInductionType();
7779   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7780                                          LoopVectorPreHeader->getFirstNonPHI());
7781   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7782   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7783                            EPI.MainLoopIterationCountCheck);
7784 
7785   // Generate the induction variable.
7786   OldInduction = Legal->getPrimaryInduction();
7787   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7788   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7789   Value *StartIdx = EPResumeVal;
7790   Induction =
7791       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7792                               getDebugLocFromInstOrOperands(OldInduction));
7793 
7794   // Generate induction resume values. These variables save the new starting
7795   // indexes for the scalar loop. They are used to test if there are any tail
7796   // iterations left once the vector loop has completed.
7797   // Note that when the vectorized epilogue is skipped due to the iteration
7798   // count check, the resume value for the induction variable comes from
7799   // the trip count of the main vector loop, hence the AdditionalBypass
7800   // argument.
7801   createInductionResumeValues(Lp, CountRoundDown,
7802                               {VecEpilogueIterationCountCheck,
7803                                EPI.VectorTripCount} /* AdditionalBypass */);
7804 
7805   AddRuntimeUnrollDisableMetaData(Lp);
7806   return completeLoopSkeleton(Lp, OrigLoopID);
7807 }
7808 
7809 BasicBlock *
7810 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7811     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
7812 
7813   assert(EPI.TripCount &&
7814          "Expected trip count to have been saved in the first pass.");
7815   assert(
7816       (!isa<Instruction>(EPI.TripCount) ||
7817        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7818       "saved trip count does not dominate insertion point.");
7819   Value *TC = EPI.TripCount;
7820   IRBuilder<> Builder(Insert->getTerminator());
7821   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7822 
7823   // Generate code to check if the loop's trip count is less than VF * UF of the
7824   // vector epilogue loop.
7825   auto P =
7826       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7827 
7828   Value *CheckMinIters = Builder.CreateICmp(
7829       P, Count,
7830       ConstantInt::get(Count->getType(),
7831                        EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
7832       "min.epilog.iters.check");
7833 
7834   ReplaceInstWithInst(
7835       Insert->getTerminator(),
7836       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7837 
7838   LoopBypassBlocks.push_back(Insert);
7839   return Insert;
7840 }
7841 
7842 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7843   LLVM_DEBUG({
7844     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7845            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7846            << ", Main Loop UF:" << EPI.MainLoopUF
7847            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7848            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7849   });
7850 }
7851 
7852 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7853   DEBUG_WITH_TYPE(VerboseDebug, {
7854     dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
7855   });
7856 }
7857 
7858 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7859     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7860   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7861   bool PredicateAtRangeStart = Predicate(Range.Start);
7862 
7863   for (ElementCount TmpVF = Range.Start * 2;
7864        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7865     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7866       Range.End = TmpVF;
7867       break;
7868     }
7869 
7870   return PredicateAtRangeStart;
7871 }
7872 
7873 /// Build VPlans for the full range of feasible VFs = {\p MinVF, 2 * \p MinVF,
7874 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7875 /// of VFs starting at a given VF and extending it as much as possible. Each
7876 /// vectorization decision can potentially shorten this sub-range during
7877 /// buildVPlan().
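/// For example (illustrative), with MinVF = 1 and MaxVF = 8 and no decision
/// splitting the range, a single VPlan covering VFs {1, 2, 4, 8} is built; a
/// decision that first differs at VF = 4 would instead yield two VPlans, one
/// for {1, 2} and one for {4, 8}.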
7878 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7879                                            ElementCount MaxVF) {
7880   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7881   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7882     VFRange SubRange = {VF, MaxVFPlusOne};
7883     VPlans.push_back(buildVPlan(SubRange));
7884     VF = SubRange.End;
7885   }
7886 }
7887 
7888 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7889                                          VPlanPtr &Plan) {
7890   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7891 
7892   // Look for cached value.
7893   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7894   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7895   if (ECEntryIt != EdgeMaskCache.end())
7896     return ECEntryIt->second;
7897 
7898   VPValue *SrcMask = createBlockInMask(Src, Plan);
7899 
7900   // The terminator has to be a branch inst!
7901   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7902   assert(BI && "Unexpected terminator found");
7903 
7904   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7905     return EdgeMaskCache[Edge] = SrcMask;
7906 
7907   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
7908   assert(EdgeMask && "No Edge Mask found for condition");
7909 
7910   if (BI->getSuccessor(0) != Dst)
7911     EdgeMask = Builder.createNot(EdgeMask);
7912 
7913   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
7914     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
7915 
7916   return EdgeMaskCache[Edge] = EdgeMask;
7917 }
7918 
7919 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
7920   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7921 
7922   // Look for cached value.
7923   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
7924   if (BCEntryIt != BlockMaskCache.end())
7925     return BCEntryIt->second;
7926 
7927   // All-one mask is modelled as no-mask following the convention for masked
7928   // load/store/gather/scatter. Initialize BlockMask to no-mask.
7929   VPValue *BlockMask = nullptr;
7930 
7931   if (OrigLoop->getHeader() == BB) {
7932     if (!CM.blockNeedsPredication(BB))
7933       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
7934 
7935     // Create the block in mask as the first non-phi instruction in the block.
7936     VPBuilder::InsertPointGuard Guard(Builder);
7937     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
7938     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
7939 
7940     // Introduce the early-exit compare IV <= BTC to form header block mask.
7941     // This is used instead of IV < TC because TC may wrap, unlike BTC.
7942     // Start by constructing the desired canonical IV.
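    // For example (illustrative), for VF = 4 the header mask is conceptually
    //   icmp ule <4 x iN> {iv, iv+1, iv+2, iv+3}, splat(BTC)
    // (or an active-lane-mask computation when the target prefers it, see
    // below).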
7943     VPValue *IV = nullptr;
7944     if (Legal->getPrimaryInduction())
7945       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
7946     else {
7947       auto IVRecipe = new VPWidenCanonicalIVRecipe();
7948       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
7949       IV = IVRecipe->getVPValue();
7950     }
7951     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
7952     bool TailFolded = !CM.isScalarEpilogueAllowed();
7953 
7954     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
7955       // While ActiveLaneMask is a binary op that consumes the loop tripcount
7956       // as a second argument, we only pass the IV here and extract the
7957       // tripcount from the transform state, where codegen of the VP
7958       // instructions happens.
7959       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
7960     } else {
7961       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
7962     }
7963     return BlockMaskCache[BB] = BlockMask;
7964   }
7965 
7966   // This is the block mask. We OR all incoming edges.
7967   for (auto *Predecessor : predecessors(BB)) {
7968     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
7969     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
7970       return BlockMaskCache[BB] = EdgeMask;
7971 
7972     if (!BlockMask) { // BlockMask has its initialized nullptr value.
7973       BlockMask = EdgeMask;
7974       continue;
7975     }
7976 
7977     BlockMask = Builder.createOr(BlockMask, EdgeMask);
7978   }
7979 
7980   return BlockMaskCache[BB] = BlockMask;
7981 }
7982 
7983 VPWidenMemoryInstructionRecipe *
7984 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
7985                                   VPlanPtr &Plan) {
7986   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7987          "Must be called with either a load or store");
7988 
7989   auto willWiden = [&](ElementCount VF) -> bool {
7990     if (VF.isScalar())
7991       return false;
7992     LoopVectorizationCostModel::InstWidening Decision =
7993         CM.getWideningDecision(I, VF);
7994     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7995            "CM decision should be taken at this point.");
7996     if (Decision == LoopVectorizationCostModel::CM_Interleave)
7997       return true;
7998     if (CM.isScalarAfterVectorization(I, VF) ||
7999         CM.isProfitableToScalarize(I, VF))
8000       return false;
8001     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8002   };
8003 
8004   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8005     return nullptr;
8006 
8007   VPValue *Mask = nullptr;
8008   if (Legal->isMaskRequired(I))
8009     Mask = createBlockInMask(I->getParent(), Plan);
8010 
8011   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
8012   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8013     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
8014 
8015   StoreInst *Store = cast<StoreInst>(I);
8016   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
8017   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
8018 }
8019 
8020 VPWidenIntOrFpInductionRecipe *
8021 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
8022   // Check if this is an integer or fp induction. If so, build the recipe that
8023   // produces its scalar and vector values.
8024   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8025   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8026       II.getKind() == InductionDescriptor::IK_FpInduction)
8027     return new VPWidenIntOrFpInductionRecipe(Phi);
8028 
8029   return nullptr;
8030 }
8031 
8032 VPWidenIntOrFpInductionRecipe *
8033 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
8034                                                 VFRange &Range) const {
8035   // Optimize the special case where the source is a constant integer
8036   // induction variable. Notice that we can only optimize the 'trunc' case
8037   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8038   // (c) other casts depend on pointer size.
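  // For example (illustrative), "%t = trunc i64 %iv to i32" of an integer IV
  // %iv with a constant step can be widened directly as an i32 induction,
  // avoiding a vector truncate.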
8039 
8040   // Determine whether \p K is a truncation based on an induction variable that
8041   // can be optimized.
8042   auto isOptimizableIVTruncate =
8043       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8044     return [=](ElementCount VF) -> bool {
8045       return CM.isOptimizableIVTruncate(K, VF);
8046     };
8047   };
8048 
8049   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8050           isOptimizableIVTruncate(I), Range))
8051     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8052                                              I);
8053   return nullptr;
8054 }
8055 
8056 VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
8057   // We know that all PHIs in non-header blocks are converted into selects, so
8058   // we don't have to worry about the insertion order and we can just use the
8059   // builder. At this point we generate the predication tree. There may be
8060   // duplications since this is a simple recursive scan, but future
8061   // optimizations will clean it up.
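  // For example (illustrative), a phi with incoming values (v0, v1) becomes a
  // VPBlendRecipe with operands (v0, mask0, v1, mask1), where each mask is the
  // mask of the corresponding incoming edge (a single-predecessor phi may omit
  // its mask).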
8062 
8063   SmallVector<VPValue *, 2> Operands;
8064   unsigned NumIncoming = Phi->getNumIncomingValues();
8065   for (unsigned In = 0; In < NumIncoming; In++) {
8066     VPValue *EdgeMask =
8067       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8068     assert((EdgeMask || NumIncoming == 1) &&
8069            "Multiple predecessors with one having a full mask");
8070     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
8071     if (EdgeMask)
8072       Operands.push_back(EdgeMask);
8073   }
8074   return new VPBlendRecipe(Phi, Operands);
8075 }
8076 
8077 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
8078                                                    VPlan &Plan) const {
8079 
8080   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8081       [this, CI](ElementCount VF) {
8082         return CM.isScalarWithPredication(CI, VF);
8083       },
8084       Range);
8085 
8086   if (IsPredicated)
8087     return nullptr;
8088 
8089   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8090   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8091              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8092              ID == Intrinsic::pseudoprobe))
8093     return nullptr;
8094 
8095   auto willWiden = [&](ElementCount VF) -> bool {
8096     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8097     // The following case may be scalarized depending on the VF.
8098     // The flag shows whether we use an intrinsic or a plain call for the
8099     // vectorized version of the instruction.
8100     // Is it beneficial to perform the intrinsic call compared to the lib call?
8101     bool NeedToScalarize = false;
8102     unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8103     bool UseVectorIntrinsic =
8104         ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
8105     return UseVectorIntrinsic || !NeedToScalarize;
8106   };
8107 
8108   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8109     return nullptr;
8110 
8111   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
8112 }
8113 
8114 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8115   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8116          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8117   // The instruction should be widened, unless it is scalar after
8118   // vectorization, its scalarization is profitable, or it is predicated.
8119   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8120     return CM.isScalarAfterVectorization(I, VF) ||
8121            CM.isProfitableToScalarize(I, VF) ||
8122            CM.isScalarWithPredication(I, VF);
8123   };
8124   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8125                                                              Range);
8126 }
8127 
8128 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
8129   auto IsVectorizableOpcode = [](unsigned Opcode) {
8130     switch (Opcode) {
8131     case Instruction::Add:
8132     case Instruction::And:
8133     case Instruction::AShr:
8134     case Instruction::BitCast:
8135     case Instruction::FAdd:
8136     case Instruction::FCmp:
8137     case Instruction::FDiv:
8138     case Instruction::FMul:
8139     case Instruction::FNeg:
8140     case Instruction::FPExt:
8141     case Instruction::FPToSI:
8142     case Instruction::FPToUI:
8143     case Instruction::FPTrunc:
8144     case Instruction::FRem:
8145     case Instruction::FSub:
8146     case Instruction::ICmp:
8147     case Instruction::IntToPtr:
8148     case Instruction::LShr:
8149     case Instruction::Mul:
8150     case Instruction::Or:
8151     case Instruction::PtrToInt:
8152     case Instruction::SDiv:
8153     case Instruction::Select:
8154     case Instruction::SExt:
8155     case Instruction::Shl:
8156     case Instruction::SIToFP:
8157     case Instruction::SRem:
8158     case Instruction::Sub:
8159     case Instruction::Trunc:
8160     case Instruction::UDiv:
8161     case Instruction::UIToFP:
8162     case Instruction::URem:
8163     case Instruction::Xor:
8164     case Instruction::ZExt:
8165       return true;
8166     }
8167     return false;
8168   };
8169 
8170   if (!IsVectorizableOpcode(I->getOpcode()))
8171     return nullptr;
8172 
8173   // Success: widen this instruction.
8174   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
8175 }
8176 
8177 VPBasicBlock *VPRecipeBuilder::handleReplication(
8178     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8179     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
8180     VPlanPtr &Plan) {
8181   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8182       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8183       Range);
8184 
8185   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8186       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
8187       Range);
8188 
8189   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8190                                        IsUniform, IsPredicated);
8191   setRecipe(I, Recipe);
8192   Plan->addVPValue(I, Recipe);
8193 
8194   // Find if I uses a predicated instruction. If so, it will use its scalar
8195   // value. Avoid hoisting the insert-element which packs the scalar value into
8196   // a vector value, as that happens iff all users use the vector value.
8197   for (auto &Op : I->operands())
8198     if (auto *PredInst = dyn_cast<Instruction>(Op))
8199       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
8200         PredInst2Recipe[PredInst]->setAlsoPack(false);
8201 
8202   // Finalize the recipe for Instr, handling the non-predicated case first.
8203   if (!IsPredicated) {
8204     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8205     VPBB->appendRecipe(Recipe);
8206     return VPBB;
8207   }
8208   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8209   assert(VPBB->getSuccessors().empty() &&
8210          "VPBB has successors when handling predicated replication.");
8211   // Record predicated instructions for above packing optimizations.
8212   PredInst2Recipe[I] = Recipe;
8213   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8214   VPBlockUtils::insertBlockAfter(Region, VPBB);
8215   auto *RegSucc = new VPBasicBlock();
8216   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8217   return RegSucc;
8218 }
8219 
8220 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8221                                                       VPRecipeBase *PredRecipe,
8222                                                       VPlanPtr &Plan) {
8223   // Instructions marked for predication are replicated and placed under an
8224   // if-then construct to prevent side-effects.
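  // The resulting region has the shape (illustrative):
  //   pred.<opcode>.entry -> pred.<opcode>.if -> pred.<opcode>.continue
  // with an additional edge from the entry block directly to the continue
  // block; the entry block branches on the block-in mask.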
8225 
8226   // Generate recipes to compute the block mask for this region.
8227   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8228 
8229   // Build the triangular if-then region.
8230   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8231   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8232   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8233   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8234   auto *PHIRecipe = Instr->getType()->isVoidTy()
8235                         ? nullptr
8236                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8237   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8238   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8239   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8240 
8241   // Note: first set Entry as region entry and then connect successors starting
8242   // from it in order, to propagate the "parent" of each VPBasicBlock.
8243   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8244   VPBlockUtils::connectBlocks(Pred, Exit);
8245 
8246   return Region;
8247 }
8248 
8249 VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8250                                                       VFRange &Range,
8251                                                       VPlanPtr &Plan) {
8252   // First, check for specific widening recipes that deal with calls, memory
8253   // operations, inductions and Phi nodes.
8254   if (auto *CI = dyn_cast<CallInst>(Instr))
8255     return tryToWidenCall(CI, Range, *Plan);
8256 
8257   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8258     return tryToWidenMemory(Instr, Range, Plan);
8259 
8260   VPRecipeBase *Recipe;
8261   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8262     if (Phi->getParent() != OrigLoop->getHeader())
8263       return tryToBlend(Phi, Plan);
8264     if ((Recipe = tryToOptimizeInductionPHI(Phi)))
8265       return Recipe;
8266     return new VPWidenPHIRecipe(Phi);
8267   }
8268 
8269   if (isa<TruncInst>(Instr) &&
8270       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
8271     return Recipe;
8272 
8273   if (!shouldWiden(Instr, Range))
8274     return nullptr;
8275 
8276   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8277     return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
8278                                 OrigLoop);
8279 
8280   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8281     bool InvariantCond =
8282         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8283     return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
8284                                    InvariantCond);
8285   }
8286 
8287   return tryToWiden(Instr, *Plan);
8288 }
8289 
8290 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8291                                                         ElementCount MaxVF) {
8292   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8293 
8294   // Collect instructions from the original loop that will become trivially dead
8295   // in the vectorized loop. We don't need to vectorize these instructions. For
8296   // example, original induction update instructions can become dead because we
8297   // separately emit induction "steps" when generating code for the new loop.
8298   // Similarly, we create a new latch condition when setting up the structure
8299   // of the new loop, so the old one can become dead.
8300   SmallPtrSet<Instruction *, 4> DeadInstructions;
8301   collectTriviallyDeadInstructions(DeadInstructions);
8302 
8303   // Add assume instructions we need to drop to DeadInstructions, to prevent
8304   // them from being added to the VPlan.
8305   // TODO: We only need to drop assumes in blocks that get flattened. If the
8306   // control flow is preserved, we should keep them.
8307   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8308   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8309 
8310   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8311   // Dead instructions do not need sinking. Remove them from SinkAfter.
8312   for (Instruction *I : DeadInstructions)
8313     SinkAfter.erase(I);
8314 
8315   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8316   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8317     VFRange SubRange = {VF, MaxVFPlusOne};
8318     VPlans.push_back(
8319         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8320     VF = SubRange.End;
8321   }
8322 }
8323 
8324 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8325     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8326     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8327 
8328   // Hold a mapping from predicated instructions to their recipes, in order to
8329   // fix their AlsoPack behavior if a user is determined to replicate and use a
8330   // scalar instead of a vector value.
8331   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
8332 
8333   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8334 
8335   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8336 
8337   // ---------------------------------------------------------------------------
8338   // Pre-construction: record ingredients whose recipes we'll need to further
8339   // process after constructing the initial VPlan.
8340   // ---------------------------------------------------------------------------
8341 
8342   // Mark instructions we'll need to sink later and their targets as
8343   // ingredients whose recipe we'll need to record.
8344   for (auto &Entry : SinkAfter) {
8345     RecipeBuilder.recordRecipeOf(Entry.first);
8346     RecipeBuilder.recordRecipeOf(Entry.second);
8347   }
8348   for (auto &Reduction : CM.getInLoopReductionChains()) {
8349     PHINode *Phi = Reduction.first;
8350     RecurrenceDescriptor::RecurrenceKind Kind =
8351         Legal->getReductionVars()[Phi].getRecurrenceKind();
8352     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8353 
8354     RecipeBuilder.recordRecipeOf(Phi);
8355     for (auto &R : ReductionOperations) {
8356       RecipeBuilder.recordRecipeOf(R);
8357       // For min/max reductions, where we have a pair of icmp/select, we also
8358       // need to record the ICmp recipe, so it can be removed later.
8359       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8360           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8361         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8362       }
8363     }
8364   }
8365 
8366   // For each interleave group which is relevant for this (possibly trimmed)
8367   // Range, add it to the set of groups to be later applied to the VPlan and add
8368   // placeholders for its members' Recipes which we'll be replacing with a
8369   // single VPInterleaveRecipe.
8370   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8371     auto applyIG = [IG, this](ElementCount VF) -> bool {
8372       return (VF.isVector() && // Query is illegal for VF == 1
8373               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8374                   LoopVectorizationCostModel::CM_Interleave);
8375     };
8376     if (!getDecisionAndClampRange(applyIG, Range))
8377       continue;
8378     InterleaveGroups.insert(IG);
8379     for (unsigned i = 0; i < IG->getFactor(); i++)
8380       if (Instruction *Member = IG->getMember(i))
8381         RecipeBuilder.recordRecipeOf(Member);
8382   }
8383 
8384   // ---------------------------------------------------------------------------
8385   // Build initial VPlan: Scan the body of the loop in a topological order to
8386   // visit each basic block after having visited its predecessor basic blocks.
8387   // ---------------------------------------------------------------------------
8388 
8389   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8390   auto Plan = std::make_unique<VPlan>();
8391   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8392   Plan->setEntry(VPBB);
8393 
8394   // Scan the body of the loop in a topological order to visit each basic block
8395   // after having visited its predecessor basic blocks.
8396   LoopBlocksDFS DFS(OrigLoop);
8397   DFS.perform(LI);
8398 
8399   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8400     // Relevant instructions from basic block BB will be grouped into VPRecipe
8401     // ingredients and will fill a new VPBasicBlock.
8402     unsigned VPBBsForBB = 0;
8403     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8404     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8405     VPBB = FirstVPBBForBB;
8406     Builder.setInsertPoint(VPBB);
8407 
8408     // Introduce each ingredient into VPlan.
8409     // TODO: Model and preserve debug intrinsics in VPlan.
8410     for (Instruction &I : BB->instructionsWithoutDebug()) {
8411       Instruction *Instr = &I;
8412 
8413       // First filter out irrelevant instructions, to ensure no recipes are
8414       // built for them.
8415       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8416         continue;
8417 
8418       if (auto Recipe =
8419               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8420         // Check if the recipe can be converted to a VPValue. We need the extra
8421         // down-casting step until VPRecipeBase inherits from VPValue.
8422         VPValue *MaybeVPValue = Recipe->toVPValue();
8423         if (!Instr->getType()->isVoidTy() && MaybeVPValue)
8424           Plan->addVPValue(Instr, MaybeVPValue);
8425 
8426         RecipeBuilder.setRecipe(Instr, Recipe);
8427         VPBB->appendRecipe(Recipe);
8428         continue;
8429       }
8430 
8431       // Otherwise, if all widening options failed, the instruction is to be
8432       // replicated. This may create a successor for VPBB.
8433       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
8434           Instr, Range, VPBB, PredInst2Recipe, Plan);
8435       if (NextVPBB != VPBB) {
8436         VPBB = NextVPBB;
8437         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8438                                     : "");
8439       }
8440     }
8441   }
8442 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
8446   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8447   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8448   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8449   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8450   delete PreEntry;
8451 
8452   // ---------------------------------------------------------------------------
8453   // Transform initial VPlan: Apply previously taken decisions, in order, to
8454   // bring the VPlan to its final state.
8455   // ---------------------------------------------------------------------------
8456 
8457   // Apply Sink-After legal constraints.
8458   for (auto &Entry : SinkAfter) {
8459     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8460     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8461     Sink->moveAfter(Target);
8462   }
8463 
8464   // Interleave memory: for each Interleave Group we marked earlier as relevant
8465   // for this VPlan, replace the Recipes widening its memory instructions with a
8466   // single VPInterleaveRecipe at its insertion point.
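  // For example, an interleave group of two adjacent loads is widened into a
  // single wide load followed by shufflevectors that de-interleave the loaded
  // values (see InnerLoopVectorizer::vectorizeInterleaveGroup for the actual
  // code generation).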
8467   for (auto IG : InterleaveGroups) {
8468     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8469         RecipeBuilder.getRecipe(IG->getInsertPos()));
8470     SmallVector<VPValue *, 4> StoredValues;
8471     for (unsigned i = 0; i < IG->getFactor(); ++i)
8472       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
8473         StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
8474 
8475     (new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8476                             Recipe->getMask()))
8477         ->insertBefore(Recipe);
8478 
8479     for (unsigned i = 0; i < IG->getFactor(); ++i)
8480       if (Instruction *Member = IG->getMember(i)) {
8481         if (!Member->getType()->isVoidTy()) {
8482           VPValue *OriginalV = Plan->getVPValue(Member);
8483           Plan->removeVPValueFor(Member);
8484           OriginalV->replaceAllUsesWith(Plan->getOrAddVPValue(Member));
8485         }
8486         RecipeBuilder.getRecipe(Member)->eraseFromParent();
8487       }
8488   }
8489 
  // Adjust the recipes for any in-loop reductions.
8491   if (Range.Start.isVector())
8492     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
8493 
8494   // Finally, if tail is folded by masking, introduce selects between the phi
8495   // and the live-out instruction of each reduction, at the end of the latch.
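  // The select keeps the value coming from the reduction phi for lanes that
  // are disabled by the block-in mask, so the masked-off tail iterations do
  // not perturb the reduction result.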
8496   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
8497     Builder.setInsertPoint(VPBB);
8498     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
8499     for (auto &Reduction : Legal->getReductionVars()) {
8500       if (CM.isInLoopReduction(Reduction.first))
8501         continue;
8502       VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
8503       VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
8504       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
8505     }
8506   }
8507 
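  // Record all candidate VFs in the half-open range [Range.Start, Range.End),
  // doubling the VF each step, and reflect them in the VPlan's name for
  // debugging output.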
8508   std::string PlanName;
8509   raw_string_ostream RSO(PlanName);
8510   ElementCount VF = Range.Start;
8511   Plan->addVF(VF);
8512   RSO << "Initial VPlan for VF={" << VF;
8513   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
8514     Plan->addVF(VF);
8515     RSO << "," << VF;
8516   }
8517   RSO << "},UF>=1";
8518   RSO.flush();
8519   Plan->setName(PlanName);
8520 
8521   return Plan;
8522 }
8523 
8524 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
  assert(!OrigLoop->isInnermost() && "Expected an outer loop.");
8530   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8531 
8532   // Create new empty VPlan
8533   auto Plan = std::make_unique<VPlan>();
8534 
8535   // Build hierarchical CFG
8536   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8537   HCFGBuilder.buildHierarchicalCFG();
8538 
8539   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8540        VF *= 2)
8541     Plan->addVF(VF);
8542 
8543   if (EnableVPlanPredication) {
8544     VPlanPredicator VPP(*Plan);
8545     VPP.predicate();
8546 
8547     // Avoid running transformation to recipes until masked code generation in
8548     // VPlan-native path is in place.
8549     return Plan;
8550   }
8551 
8552   SmallPtrSet<Instruction *, 1> DeadInstructions;
8553   VPlanTransforms::VPInstructionsToVPRecipes(
8554       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
8555   return Plan;
8556 }
8557 
// Adjust the recipes for any in-loop reductions. The chain of instructions
// leading from the loop exit instruction to the phi needs to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
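// For example (sketch), for an in-loop integer add reduction
//   %phi = phi i32 [ 0, %preheader ], [ %sum, %latch ]
//   %sum = add i32 %phi, %val
// the recipe widening %sum is replaced by a VPReductionRecipe whose chain
// operand corresponds to %phi and whose vector operand corresponds to %val.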
8562 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8563     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8564   for (auto &Reduction : CM.getInLoopReductionChains()) {
8565     PHINode *Phi = Reduction.first;
8566     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8567     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8568 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For min/max reductions the chain will be the select instructions.
8573     Instruction *Chain = Phi;
8574     for (Instruction *R : ReductionOperations) {
8575       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8576       RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc.getRecurrenceKind();
8577 
8578       VPValue *ChainOp = Plan->getVPValue(Chain);
8579       unsigned FirstOpId;
8580       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8581           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8582         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8583                "Expected to replace a VPWidenSelectSC");
8584         FirstOpId = 1;
8585       } else {
8586         assert(isa<VPWidenRecipe>(WidenRecipe) &&
8587                "Expected to replace a VPWidenSC");
8588         FirstOpId = 0;
8589       }
8590       unsigned VecOpId =
8591           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
8592       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
8593 
8594       auto *CondOp = CM.foldTailByMasking()
8595                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
8596                          : nullptr;
8597       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
8598           &RdxDesc, R, ChainOp, VecOp, CondOp, Legal->hasFunNoNaNAttr(), TTI);
8599       WidenRecipe->toVPValue()->replaceAllUsesWith(RedRecipe);
8600       Plan->removeVPValueFor(R);
8601       Plan->addVPValue(R, RedRecipe);
8602       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
8603       WidenRecipe->eraseFromParent();
8604 
8605       if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8606           Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8607         VPRecipeBase *CompareRecipe =
8608             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
8609         assert(isa<VPWidenRecipe>(CompareRecipe) &&
8610                "Expected to replace a VPWidenSC");
8611         assert(CompareRecipe->toVPValue()->getNumUsers() == 0 &&
8612                "Expected no remaining users");
8613         CompareRecipe->eraseFromParent();
8614       }
8615       Chain = R;
8616     }
8617   }
8618 }
8619 
Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateVectorValues(
    Value *V, unsigned Part) {
  return ILV.getOrCreateVectorValue(V, Part);
}
8624 
8625 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
8626     Value *V, const VPIteration &Instance) {
8627   return ILV.getOrCreateScalarValue(V, Instance);
8628 }
8629 
8630 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
8631                                VPSlotTracker &SlotTracker) const {
8632   O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
8633   IG->getInsertPos()->printAsOperand(O, false);
8634   O << ", ";
8635   getAddr()->printAsOperand(O, SlotTracker);
8636   VPValue *Mask = getMask();
8637   if (Mask) {
8638     O << ", ";
8639     Mask->printAsOperand(O, SlotTracker);
8640   }
8641   for (unsigned i = 0; i < IG->getFactor(); ++i)
8642     if (Instruction *I = IG->getMember(i))
8643       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
8644 }
8645 
8646 void VPWidenCallRecipe::execute(VPTransformState &State) {
8647   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
8648                                   *this, State);
8649 }
8650 
8651 void VPWidenSelectRecipe::execute(VPTransformState &State) {
8652   State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
8653                                     this, *this, InvariantCond, State);
8654 }
8655 
8656 void VPWidenRecipe::execute(VPTransformState &State) {
8657   State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
8658 }
8659 
8660 void VPWidenGEPRecipe::execute(VPTransformState &State) {
8661   State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
8662                       *this, State.UF, State.VF, IsPtrLoopInvariant,
8663                       IsIndexLoopInvariant, State);
8664 }
8665 
8666 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
8667   assert(!State.Instance && "Int or FP induction being replicated.");
8668   State.ILV->widenIntOrFpInduction(IV, Trunc);
8669 }
8670 
8671 void VPWidenPHIRecipe::execute(VPTransformState &State) {
8672   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
8673 }
8674 
8675 void VPBlendRecipe::execute(VPTransformState &State) {
8676   State.ILV->setDebugLocFromInst(State.Builder, Phi);
8677   // We know that all PHIs in non-header blocks are converted into
8678   // selects, so we don't have to worry about the insertion order and we
8679   // can just use the builder.
8680   // At this point we generate the predication tree. There may be
8681   // duplications since this is a simple recursive scan, but future
8682   // optimizations will clean it up.
8683 
8684   unsigned NumIncoming = getNumIncomingValues();
8685 
8686   // Generate a sequence of selects of the form:
8687   // SELECT(Mask3, In3,
8688   //        SELECT(Mask2, In2,
8689   //               SELECT(Mask1, In1,
8690   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are essentially undef, are taken from In0.
8693   InnerLoopVectorizer::VectorParts Entry(State.UF);
8694   for (unsigned In = 0; In < NumIncoming; ++In) {
8695     for (unsigned Part = 0; Part < State.UF; ++Part) {
8696       // We might have single edge PHIs (blocks) - use an identity
8697       // 'select' for the first PHI operand.
8698       Value *In0 = State.get(getIncomingValue(In), Part);
8699       if (In == 0)
8700         Entry[Part] = In0; // Initialize with the first incoming value.
8701       else {
8702         // Select between the current value and the previous incoming edge
8703         // based on the incoming mask.
8704         Value *Cond = State.get(getMask(In), Part);
8705         Entry[Part] =
8706             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
8707       }
8708     }
8709   }
8710   for (unsigned Part = 0; Part < State.UF; ++Part)
8711     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
8712 }
8713 
8714 void VPInterleaveRecipe::execute(VPTransformState &State) {
8715   assert(!State.Instance && "Interleave group being replicated.");
8716   State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getStoredValues(),
8717                                       getMask());
8718 }
8719 
8720 void VPReductionRecipe::execute(VPTransformState &State) {
8721   assert(!State.Instance && "Reduction being replicated.");
8722   for (unsigned Part = 0; Part < State.UF; ++Part) {
8723     RecurrenceDescriptor::RecurrenceKind Kind = RdxDesc->getRecurrenceKind();
8724     Value *NewVecOp = State.get(getVecOp(), Part);
8725     if (VPValue *Cond = getCondOp()) {
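      // Blend masked-off lanes with the reduction identity (e.g. 0 for an
      // integer add) so that they do not affect the reduced value.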
8726       Value *NewCond = State.get(Cond, Part);
8727       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
8728       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
8729           Kind, RdxDesc->getMinMaxRecurrenceKind(), VecTy->getElementType());
8730       Constant *IdenVec =
8731           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
8732       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
8733       NewVecOp = Select;
8734     }
8735     Value *NewRed =
8736         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp, NoNaN);
8737     Value *PrevInChain = State.get(getChainOp(), Part);
8738     Value *NextInChain;
8739     if (Kind == RecurrenceDescriptor::RK_IntegerMinMax ||
8740         Kind == RecurrenceDescriptor::RK_FloatMinMax) {
8741       NextInChain =
8742           createMinMaxOp(State.Builder, RdxDesc->getMinMaxRecurrenceKind(),
8743                          NewRed, PrevInChain);
8744     } else {
8745       NextInChain = State.Builder.CreateBinOp(
8746           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
8747           PrevInChain);
8748     }
8749     State.set(this, getUnderlyingInstr(), NextInChain, Part);
8750   }
8751 }
8752 
8753 void VPReplicateRecipe::execute(VPTransformState &State) {
8754   if (State.Instance) { // Generate a single instance.
8755     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
8756     State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
8757                                     *State.Instance, IsPredicated, State);
8758     // Insert scalar instance packing it into a vector.
8759     if (AlsoPack && State.VF.isVector()) {
8760       // If we're constructing lane 0, initialize to start from undef.
8761       if (State.Instance->Lane == 0) {
8762         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
8763         Value *Undef = UndefValue::get(
8764             VectorType::get(getUnderlyingValue()->getType(), State.VF));
8765         State.ValueMap.setVectorValue(getUnderlyingInstr(),
8766                                       State.Instance->Part, Undef);
8767       }
8768       State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
8769                                            *State.Instance);
8770     }
8771     return;
8772   }
8773 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
8777   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
8778   assert((!State.VF.isScalable() || IsUniform) &&
8779          "Can't scalarize a scalable vector");
8780   for (unsigned Part = 0; Part < State.UF; ++Part)
8781     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
8782       State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
8783                                       IsPredicated, State);
8784 }
8785 
8786 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
8787   assert(State.Instance && "Branch on Mask works only on single instance.");
8788 
8789   unsigned Part = State.Instance->Part;
8790   unsigned Lane = State.Instance->Lane;
8791 
8792   Value *ConditionBit = nullptr;
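  // Compute the scalar condition bit for this lane: extract it from the
  // block-in mask if one exists; otherwise the block executes unconditionally
  // and the condition is simply true.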
8793   VPValue *BlockInMask = getMask();
8794   if (BlockInMask) {
8795     ConditionBit = State.get(BlockInMask, Part);
8796     if (ConditionBit->getType()->isVectorTy())
8797       ConditionBit = State.Builder.CreateExtractElement(
8798           ConditionBit, State.Builder.getInt32(Lane));
8799   } else // Block in mask is all-one.
8800     ConditionBit = State.Builder.getTrue();
8801 
8802   // Replace the temporary unreachable terminator with a new conditional branch,
8803   // whose two destinations will be set later when they are created.
8804   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
8805   assert(isa<UnreachableInst>(CurrentTerminator) &&
8806          "Expected to replace unreachable terminator with conditional branch.");
8807   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
8808   CondBr->setSuccessor(0, nullptr);
8809   ReplaceInstWithInst(CurrentTerminator, CondBr);
8810 }
8811 
8812 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
8813   assert(State.Instance && "Predicated instruction PHI works per instance.");
8814   Instruction *ScalarPredInst =
8815       cast<Instruction>(State.get(getOperand(0), *State.Instance));
8816   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
8817   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
8818   assert(PredicatingBB && "Predicated block has no single predecessor.");
8819 
8820   // By current pack/unpack logic we need to generate only a single phi node: if
8821   // a vector value for the predicated instruction exists at this point it means
8822   // the instruction has vector users only, and a phi for the vector value is
8823   // needed. In this case the recipe of the predicated instruction is marked to
8824   // also do that packing, thereby "hoisting" the insert-element sequence.
8825   // Otherwise, a phi node for the scalar value is needed.
8826   unsigned Part = State.Instance->Part;
8827   Instruction *PredInst =
8828       cast<Instruction>(getOperand(0)->getUnderlyingValue());
8829   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
8830     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
8831     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
8832     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
8833     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
8834     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
8835     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
8836   } else {
8837     Type *PredInstType = PredInst->getType();
8838     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
8839     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
8840     Phi->addIncoming(ScalarPredInst, PredicatedBB);
8841     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
8842   }
8843 }
8844 
8845 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
8846   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
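  // A store defines no value, so only loads pass their VPValue definition down
  // to the widening code; for stores a nullptr is passed instead.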
8847   State.ILV->vectorizeMemoryInstruction(&Ingredient, State,
8848                                         StoredValue ? nullptr : toVPValue(),
8849                                         getAddr(), StoredValue, getMask());
8850 }
8851 
// Determine how to lower the scalar epilogue, which depends on 1) optimizing
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyzes whether the loop is suitable
// for predication.
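// The result is one of CM_ScalarEpilogueAllowed,
// CM_ScalarEpilogueNotAllowedOptSize, CM_ScalarEpilogueNotNeededUsePredicate
// or CM_ScalarEpilogueNotAllowedUsePredicate, checked in the order listed
// above.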
8856 static ScalarEpilogueLowering getScalarEpilogueLowering(
8857     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
8858     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8859     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
8860     LoopVectorizationLegality &LVL) {
8861   // 1) OptSize takes precedence over all other options, i.e. if this is set,
8862   // don't look at hints or options, and don't request a scalar epilogue.
8863   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
8864   // LoopAccessInfo (due to code dependency and not being able to reliably get
8865   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
8866   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
8867   // versioning when the vectorization is forced, unlike hasOptSize. So revert
8868   // back to the old way and vectorize with versioning when forced. See D81345.)
8869   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
8870                                                       PGSOQueryType::IRPass) &&
8871                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8872     return CM_ScalarEpilogueNotAllowedOptSize;
8873 
8874   // 2) If set, obey the directives
8875   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
8876     switch (PreferPredicateOverEpilogue) {
8877     case PreferPredicateTy::ScalarEpilogue:
8878       return CM_ScalarEpilogueAllowed;
8879     case PreferPredicateTy::PredicateElseScalarEpilogue:
8880       return CM_ScalarEpilogueNotNeededUsePredicate;
8881     case PreferPredicateTy::PredicateOrDontVectorize:
8882       return CM_ScalarEpilogueNotAllowedUsePredicate;
8883     };
8884   }
8885 
8886   // 3) If set, obey the hints
8887   switch (Hints.getPredicate()) {
8888   case LoopVectorizeHints::FK_Enabled:
8889     return CM_ScalarEpilogueNotNeededUsePredicate;
8890   case LoopVectorizeHints::FK_Disabled:
8891     return CM_ScalarEpilogueAllowed;
8892   };
8893 
  // 4) If the TTI hook indicates this is profitable, request predication.
8895   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
8896                                        LVL.getLAI()))
8897     return CM_ScalarEpilogueNotNeededUsePredicate;
8898 
8899   return CM_ScalarEpilogueAllowed;
8900 }
8901 
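// Record V as the value of Def for the given Part, both in the VPlan-managed
// state and in the InnerLoopVectorizer's per-part value map keyed by the
// underlying IR value IRDef.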
8902 void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
8903                            unsigned Part) {
8904   set(Def, V, Part);
8905   ILV->setVectorValue(IRDef, Part, V);
8906 }
8907 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows VPlan-to-VPlan
// transformations to be applied from the very beginning without modifying the
// input LLVM IR.
8912 static bool processLoopInVPlanNativePath(
8913     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8914     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8915     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8916     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
8917     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
8918 
8919   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
8920     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8921     return false;
8922   }
8923   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8924   Function *F = L->getHeader()->getParent();
8925   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8926 
8927   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
8928       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
8929 
8930   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
8931                                 &Hints, IAI);
8932   // Use the planner for outer loop vectorization.
8933   // TODO: CM is not used at this point inside the planner. Turn CM into an
8934   // optional argument if we don't need it in the future.
8935   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
8936 
8937   // Get user vectorization factor.
8938   ElementCount UserVF = Hints.getWidth();
8939 
8940   // Plan how to best vectorize, return the best VF and its cost.
8941   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
8942 
8943   // If we are stress testing VPlan builds, do not attempt to generate vector
8944   // code. Masked vector code generation support will follow soon.
8945   // Also, do not attempt to vectorize if no vector code will be produced.
8946   if (VPlanBuildStressTest || EnableVPlanPredication ||
8947       VectorizationFactor::Disabled() == VF)
8948     return false;
8949 
8950   LVP.setBestPlan(VF.Width, 1);
8951 
8952   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
8953                          &CM, BFI, PSI);
8954   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8955                     << L->getHeader()->getParent()->getName() << "\"\n");
8956   LVP.executePlan(LB, DT);
8957 
8958   // Mark the loop as already vectorized to avoid vectorizing again.
8959   Hints.setAlreadyVectorized();
8960 
8961   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8962   return true;
8963 }
8964 
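// When loop interleaving or loop vectorization is globally disabled, restrict
// the corresponding transformation to loops where it is explicitly forced.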
8965 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
8966     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
8967                                !EnableLoopInterleaving),
8968       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
8969                               !EnableLoopVectorization) {}
8970 
8971 bool LoopVectorizePass::processLoop(Loop *L) {
8972   assert((EnableVPlanNativePath || L->isInnermost()) &&
8973          "VPlan-native path is not enabled. Only process inner loops.");
8974 
8975 #ifndef NDEBUG
8976   const std::string DebugLocStr = getDebugLocString(L);
8977 #endif /* NDEBUG */
8978 
8979   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
8980                     << L->getHeader()->getParent()->getName() << "\" from "
8981                     << DebugLocStr << "\n");
8982 
8983   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
8984 
8985   LLVM_DEBUG(
8986       dbgs() << "LV: Loop hints:"
8987              << " force="
8988              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
8989                      ? "disabled"
8990                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
8991                             ? "enabled"
8992                             : "?"))
8993              << " width=" << Hints.getWidth()
8994              << " unroll=" << Hints.getInterleave() << "\n");
8995 
  // Function containing the loop.
8997   Function *F = L->getHeader()->getParent();
8998 
  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are used,
  // less verbosely, to report vectorized loops and unvectorized loops that
  // may benefit from vectorization, respectively.
9006 
9007   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9008     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9009     return false;
9010   }
9011 
9012   PredicatedScalarEvolution PSE(*SE, *L);
9013 
9014   // Check if it is legal to vectorize the loop.
9015   LoopVectorizationRequirements Requirements(*ORE);
9016   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
9017                                 &Requirements, &Hints, DB, AC, BFI, PSI);
9018   if (!LVL.canVectorize(EnableVPlanNativePath)) {
9019     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9020     Hints.emitRemarkWithHints();
9021     return false;
9022   }
9023 
9024   // Check the function attributes and profiles to find out if this function
9025   // should be optimized for size.
9026   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9027       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
9028 
9029   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9030   // here. They may require CFG and instruction level transformations before
9031   // even evaluating whether vectorization is profitable. Since we cannot modify
9032   // the incoming IR, we need to build VPlan upfront in the vectorization
9033   // pipeline.
9034   if (!L->isInnermost())
9035     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9036                                         ORE, BFI, PSI, Hints);
9037 
9038   assert(L->isInnermost() && "Inner loop expected.");
9039 
9040   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9041   // count by optimizing for size, to minimize overheads.
9042   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9043   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9044     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9045                       << "This loop is worth vectorizing only if no scalar "
9046                       << "iteration overheads are incurred.");
9047     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9048       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9049     else {
9050       LLVM_DEBUG(dbgs() << "\n");
9051       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9052     }
9053   }
9054 
9055   // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem right -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
9059   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9060     reportVectorizationFailure(
9061         "Can't vectorize when the NoImplicitFloat attribute is used",
9062         "loop not vectorized due to NoImplicitFloat attribute",
9063         "NoImplicitFloat", ORE, L);
9064     Hints.emitRemarkWithHints();
9065     return false;
9066   }
9067 
9068   // Check if the target supports potentially unsafe FP vectorization.
9069   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9070   // for the target we're vectorizing for, to make sure none of the
9071   // additional fp-math flags can help.
9072   if (Hints.isPotentiallyUnsafe() &&
9073       TTI->isFPVectorizationPotentiallyUnsafe()) {
9074     reportVectorizationFailure(
9075         "Potentially unsafe FP op prevents vectorization",
9076         "loop not vectorized due to unsafe FP support.",
9077         "UnsafeFP", ORE, L);
9078     Hints.emitRemarkWithHints();
9079     return false;
9080   }
9081 
9082   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9083   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9084 
9085   // If an override option has been passed in for interleaved accesses, use it.
9086   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9087     UseInterleaved = EnableInterleavedMemAccesses;
9088 
9089   // Analyze interleaved memory accesses.
9090   if (UseInterleaved) {
9091     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9092   }
9093 
9094   // Use the cost model.
9095   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9096                                 F, &Hints, IAI);
9097   CM.collectValuesToIgnore();
9098 
9099   // Use the planner for vectorization.
9100   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
9101 
9102   // Get user vectorization factor and interleave count.
9103   ElementCount UserVF = Hints.getWidth();
9104   unsigned UserIC = Hints.getInterleave();
9105 
9106   // Plan how to best vectorize, return the best VF and its cost.
9107   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9108 
9109   VectorizationFactor VF = VectorizationFactor::Disabled();
9110   unsigned IC = 1;
9111 
9112   if (MaybeVF) {
9113     VF = *MaybeVF;
9114     // Select the interleave count.
9115     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9116   }
9117 
9118   // Identify the diagnostic messages that should be produced.
9119   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9120   bool VectorizeLoop = true, InterleaveLoop = true;
9121   if (Requirements.doesNotMeet(F, L, Hints)) {
9122     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
9123                          "requirements.\n");
9124     Hints.emitRemarkWithHints();
9125     return false;
9126   }
9127 
9128   if (VF.Width.isScalar()) {
9129     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9130     VecDiagMsg = std::make_pair(
9131         "VectorizationNotBeneficial",
9132         "the cost-model indicates that vectorization is not beneficial");
9133     VectorizeLoop = false;
9134   }
9135 
9136   if (!MaybeVF && UserIC > 1) {
9137     // Tell the user interleaving was avoided up-front, despite being explicitly
9138     // requested.
9139     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9140                          "interleaving should be avoided up front\n");
9141     IntDiagMsg = std::make_pair(
9142         "InterleavingAvoided",
9143         "Ignoring UserIC, because interleaving was avoided up front");
9144     InterleaveLoop = false;
9145   } else if (IC == 1 && UserIC <= 1) {
9146     // Tell the user interleaving is not beneficial.
9147     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9148     IntDiagMsg = std::make_pair(
9149         "InterleavingNotBeneficial",
9150         "the cost-model indicates that interleaving is not beneficial");
9151     InterleaveLoop = false;
9152     if (UserIC == 1) {
9153       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9154       IntDiagMsg.second +=
9155           " and is explicitly disabled or interleave count is set to 1";
9156     }
9157   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial but is explicitly disabled.
9159     LLVM_DEBUG(
9160         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9161     IntDiagMsg = std::make_pair(
9162         "InterleavingBeneficialButDisabled",
9163         "the cost-model indicates that interleaving is beneficial "
9164         "but is explicitly disabled or interleave count is set to 1");
9165     InterleaveLoop = false;
9166   }
9167 
9168   // Override IC if user provided an interleave count.
9169   IC = UserIC > 0 ? UserIC : IC;
9170 
9171   // Emit diagnostic messages, if any.
9172   const char *VAPassName = Hints.vectorizeAnalysisPassName();
9173   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
9175     ORE->emit([&]() {
9176       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9177                                       L->getStartLoc(), L->getHeader())
9178              << VecDiagMsg.second;
9179     });
9180     ORE->emit([&]() {
9181       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9182                                       L->getStartLoc(), L->getHeader())
9183              << IntDiagMsg.second;
9184     });
9185     return false;
9186   } else if (!VectorizeLoop && InterleaveLoop) {
9187     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9188     ORE->emit([&]() {
9189       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9190                                         L->getStartLoc(), L->getHeader())
9191              << VecDiagMsg.second;
9192     });
9193   } else if (VectorizeLoop && !InterleaveLoop) {
9194     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9195                       << ") in " << DebugLocStr << '\n');
9196     ORE->emit([&]() {
9197       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9198                                         L->getStartLoc(), L->getHeader())
9199              << IntDiagMsg.second;
9200     });
9201   } else if (VectorizeLoop && InterleaveLoop) {
9202     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9203                       << ") in " << DebugLocStr << '\n');
9204     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9205   }
9206 
9207   LVP.setBestPlan(VF.Width, IC);
9208 
9209   using namespace ore;
9210   bool DisableRuntimeUnroll = false;
9211   MDNode *OrigLoopID = L->getLoopID();
9212 
9213   if (!VectorizeLoop) {
9214     assert(IC > 1 && "interleave count should not be 1 or 0");
9215     // If we decided that it is not legal to vectorize the loop, then
9216     // interleave it.
9217     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
9218                                BFI, PSI);
9219     LVP.executePlan(Unroller, DT);
9220 
9221     ORE->emit([&]() {
9222       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9223                                 L->getHeader())
9224              << "interleaved loop (interleaved count: "
9225              << NV("InterleaveCount", IC) << ")";
9226     });
9227   } else {
9228     // If we decided that it is *legal* to vectorize the loop, then do it.
9229 
9230     // Consider vectorizing the epilogue too if it's profitable.
9231     VectorizationFactor EpilogueVF =
9232       CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
9233     if (EpilogueVF.Width.isVector()) {
9234 
9235       // The first pass vectorizes the main loop and creates a scalar epilogue
9236       // to be vectorized by executing the plan (potentially with a different
9237       // factor) again shortly afterwards.
9238       EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
9239                                         EpilogueVF.Width.getKnownMinValue(), 1);
9240       EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI,
9241                                          &LVL, &CM, BFI, PSI);
9242 
9243       LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
9244       LVP.executePlan(MainILV, DT);
9245       ++LoopsVectorized;
9246 
9247       simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9248       formLCSSARecursively(*L, *DT, LI, SE);
9249 
9250       // Second pass vectorizes the epilogue and adjusts the control flow
9251       // edges from the first pass.
9252       LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
9253       EPI.MainLoopVF = EPI.EpilogueVF;
9254       EPI.MainLoopUF = EPI.EpilogueUF;
9255       EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
9256                                                ORE, EPI, &LVL, &CM, BFI, PSI);
9257       LVP.executePlan(EpilogILV, DT);
9258       ++LoopsEpilogueVectorized;
9259 
9260       if (!MainILV.areSafetyChecksAdded())
9261         DisableRuntimeUnroll = true;
9262     } else {
9263       InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
9264                              &LVL, &CM, BFI, PSI);
9265       LVP.executePlan(LB, DT);
9266       ++LoopsVectorized;
9267 
9268       // Add metadata to disable runtime unrolling a scalar loop when there are
9269       // no runtime checks about strides and memory. A scalar loop that is
9270       // rarely used is not worth unrolling.
9271       if (!LB.areSafetyChecksAdded())
9272         DisableRuntimeUnroll = true;
9273     }
9274 
9275     // Report the vectorization decision.
9276     ORE->emit([&]() {
9277       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
9278                                 L->getHeader())
9279              << "vectorized loop (vectorization width: "
9280              << NV("VectorizationFactor", VF.Width)
9281              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
9282     });
9283   }
9284 
9285   Optional<MDNode *> RemainderLoopID =
9286       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
9287                                       LLVMLoopVectorizeFollowupEpilogue});
9288   if (RemainderLoopID.hasValue()) {
9289     L->setLoopID(RemainderLoopID.getValue());
9290   } else {
9291     if (DisableRuntimeUnroll)
9292       AddRuntimeUnrollDisableMetaData(L);
9293 
9294     // Mark the loop as already vectorized to avoid vectorizing again.
9295     Hints.setAlreadyVectorized();
9296   }
9297 
9298   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9299   return true;
9300 }
9301 
9302 LoopVectorizeResult LoopVectorizePass::runImpl(
9303     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
9304     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
9305     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
9306     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
9307     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
9308   SE = &SE_;
9309   LI = &LI_;
9310   TTI = &TTI_;
9311   DT = &DT_;
9312   BFI = &BFI_;
9313   TLI = TLI_;
9314   AA = &AA_;
9315   AC = &AC_;
9316   GetLAA = &GetLAA_;
9317   DB = &DB_;
9318   ORE = &ORE_;
9319   PSI = PSI_;
9320 
9321   // Don't attempt if
9322   // 1. the target claims to have no vector registers, and
9323   // 2. interleaving won't help ILP.
9324   //
9325   // The second condition is necessary because, even if the target has no
9326   // vector registers, loop vectorization may still enable scalar
9327   // interleaving.
9328   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
9329       TTI->getMaxInterleaveFactor(1) < 2)
9330     return LoopVectorizeResult(false, false);
9331 
9332   bool Changed = false, CFGChanged = false;
9333 
9334   // The vectorizer requires loops to be in simplified form.
9335   // Since simplification may add new inner loops, it has to run before the
9336   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
9338   // vectorized.
9339   for (auto &L : *LI)
9340     Changed |= CFGChanged |=
9341         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9342 
9343   // Build up a worklist of inner-loops to vectorize. This is necessary as
9344   // the act of vectorizing or partially unrolling a loop creates new loops
9345   // and can invalidate iterators across the loops.
9346   SmallVector<Loop *, 8> Worklist;
9347 
9348   for (Loop *L : *LI)
9349     collectSupportedLoops(*L, LI, ORE, Worklist);
9350 
9351   LoopsAnalyzed += Worklist.size();
9352 
9353   // Now walk the identified inner loops.
9354   while (!Worklist.empty()) {
9355     Loop *L = Worklist.pop_back_val();
9356 
9357     // For the inner loops we actually process, form LCSSA to simplify the
9358     // transform.
9359     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
9360 
9361     Changed |= CFGChanged |= processLoop(L);
9362   }
9363 
9364   // Process each loop nest in the function.
9365   return LoopVectorizeResult(Changed, CFGChanged);
9366 }
9367 
9368 PreservedAnalyses LoopVectorizePass::run(Function &F,
9369                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}
9414