1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
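//
// As a purely illustrative sketch (simplified C-like pseudo code, not the IR
// this pass actually produces), a scalar loop such as
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten, for a vector width of 4, into
//
//   for (i = 0; i + 4 <= n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3];  // one wide add per iteration
//
// with any remaining iterations handled by a scalar epilogue loop.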
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is an ongoing development effort to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallVector.h"
74 #include "llvm/ADT/Statistic.h"
75 #include "llvm/ADT/StringRef.h"
76 #include "llvm/ADT/Twine.h"
77 #include "llvm/ADT/iterator_range.h"
78 #include "llvm/Analysis/AssumptionCache.h"
79 #include "llvm/Analysis/BasicAliasAnalysis.h"
80 #include "llvm/Analysis/BlockFrequencyInfo.h"
81 #include "llvm/Analysis/CFG.h"
82 #include "llvm/Analysis/CodeMetrics.h"
83 #include "llvm/Analysis/DemandedBits.h"
84 #include "llvm/Analysis/GlobalsModRef.h"
85 #include "llvm/Analysis/LoopAccessAnalysis.h"
86 #include "llvm/Analysis/LoopAnalysisManager.h"
87 #include "llvm/Analysis/LoopInfo.h"
88 #include "llvm/Analysis/LoopIterator.h"
89 #include "llvm/Analysis/MemorySSA.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/LLVMContext.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/Type.h"
120 #include "llvm/IR/Use.h"
121 #include "llvm/IR/User.h"
122 #include "llvm/IR/Value.h"
123 #include "llvm/IR/ValueHandle.h"
124 #include "llvm/IR/Verifier.h"
125 #include "llvm/InitializePasses.h"
126 #include "llvm/Pass.h"
127 #include "llvm/Support/Casting.h"
128 #include "llvm/Support/CommandLine.h"
129 #include "llvm/Support/Compiler.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <cstdlib>
147 #include <functional>
148 #include <iterator>
149 #include <limits>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 #ifndef NDEBUG
161 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
162 #endif
163 
164 /// @{
165 /// Metadata attribute names
166 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
167 const char LLVMLoopVectorizeFollowupVectorized[] =
168     "llvm.loop.vectorize.followup_vectorized";
169 const char LLVMLoopVectorizeFollowupEpilogue[] =
170     "llvm.loop.vectorize.followup_epilogue";
171 /// @}
172 
173 STATISTIC(LoopsVectorized, "Number of loops vectorized");
174 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
175 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
176 
177 static cl::opt<bool> EnableEpilogueVectorization(
178     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
179     cl::desc("Enable vectorization of epilogue loops."));
180 
181 static cl::opt<unsigned> EpilogueVectorizationForceVF(
182     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
183     cl::desc("When epilogue vectorization is enabled, and a value greater than "
184              "1 is specified, forces the given VF for all applicable epilogue "
185              "loops."));
186 
187 static cl::opt<unsigned> EpilogueVectorizationMinVF(
188     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189     cl::desc("Only loops with vectorization factor equal to or larger than "
190              "the specified value are considered for epilogue vectorization."));
191 
192 /// Loops with a known constant trip count below this number are vectorized only
193 /// if no scalar iteration overheads are incurred.
194 static cl::opt<unsigned> TinyTripCountVectorThreshold(
195     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
196     cl::desc("Loops with a constant trip count that is smaller than this "
197              "value are vectorized only if no scalar iteration overheads "
198              "are incurred."));
199 
200 // Option prefer-predicate-over-epilogue indicates that an epilogue is
201 // undesired, that predication is preferred, and lists the available choices.
202 // I.e., the vectorizer will try to fold the tail loop (epilogue) into the
203 // vector body and predicate the instructions accordingly. If tail-folding
204 // fails, these values select the fallback strategy:
205 namespace PreferPredicateTy {
206   enum Option {
207     ScalarEpilogue = 0,
208     PredicateElseScalarEpilogue,
209     PredicateOrDontVectorize
210   };
211 } // namespace PreferPredicateTy
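
// As a rough illustration of tail-folding (pseudo code only, ignoring how the
// mask is actually materialized), the entire trip count is executed by a
// single predicated vector loop:
//
//   for (i = 0; i < n; i += VF)
//     a[i..i+VF-1] += 1;  // each lane masked by (i + lane < n)
//
// so the final, partial vector iteration runs under a mask instead of in a
// scalar epilogue loop.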
212 
213 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
214     "prefer-predicate-over-epilogue",
215     cl::init(PreferPredicateTy::ScalarEpilogue),
216     cl::Hidden,
217     cl::desc("Tail-folding and predication preferences over creating a scalar "
218              "epilogue loop."),
219     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
220                           "scalar-epilogue",
221                           "Don't tail-predicate loops, create scalar epilogue"),
222                clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
223                           "predicate-else-scalar-epilogue",
224                           "Prefer tail-folding, create scalar epilogue if tail "
225                           "folding fails."),
226                clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
227                           "predicate-dont-vectorize",
228                           "Prefer tail-folding, don't attempt vectorization if "
229                           "tail-folding fails.")));
230 
231 static cl::opt<bool> MaximizeBandwidth(
232     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
233     cl::desc("Maximize bandwidth when selecting vectorization factor which "
234              "will be determined by the smallest type in loop."));
235 
236 static cl::opt<bool> EnableInterleavedMemAccesses(
237     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
238     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
239 
240 /// An interleave-group may need masking if it resides in a block that needs
241 /// predication, or in order to mask away gaps.
242 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
243     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
245 
246 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
247     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
248     cl::desc("We don't interleave loops with an estimated constant trip count "
249              "below this number"));
250 
251 static cl::opt<unsigned> ForceTargetNumScalarRegs(
252     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
253     cl::desc("A flag that overrides the target's number of scalar registers."));
254 
255 static cl::opt<unsigned> ForceTargetNumVectorRegs(
256     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
257     cl::desc("A flag that overrides the target's number of vector registers."));
258 
259 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
260     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
261     cl::desc("A flag that overrides the target's max interleave factor for "
262              "scalar loops."));
263 
264 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
265     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
266     cl::desc("A flag that overrides the target's max interleave factor for "
267              "vectorized loops."));
268 
269 static cl::opt<unsigned> ForceTargetInstructionCost(
270     "force-target-instruction-cost", cl::init(0), cl::Hidden,
271     cl::desc("A flag that overrides the target's expected cost for "
272              "an instruction to a single constant value. Mostly "
273              "useful for getting consistent testing."));
274 
275 static cl::opt<bool> ForceTargetSupportsScalableVectors(
276     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
277     cl::desc(
278         "Pretend that scalable vectors are supported, even if the target does "
279         "not support them. This flag should only be used for testing."));
280 
281 static cl::opt<unsigned> SmallLoopCost(
282     "small-loop-cost", cl::init(20), cl::Hidden,
283     cl::desc(
284         "The cost of a loop that is considered 'small' by the interleaver."));
285 
286 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
287     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
288     cl::desc("Enable the use of the block frequency analysis to access PGO "
289              "heuristics minimizing code growth in cold regions and being more "
290              "aggressive in hot regions."));
291 
292 // Runtime interleave loops for load/store throughput.
293 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
294     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
295     cl::desc(
296         "Enable runtime interleaving until load/store ports are saturated"));
297 
298 /// Interleave small loops with scalar reductions.
299 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
300     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
301     cl::desc("Enable interleaving for loops with small iteration counts that "
302              "contain scalar reductions to expose ILP."));
303 
304 /// The number of stores in a loop that are allowed to need predication.
305 static cl::opt<unsigned> NumberOfStoresToPredicate(
306     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
307     cl::desc("Max number of stores to be predicated behind an if."));
308 
309 static cl::opt<bool> EnableIndVarRegisterHeur(
310     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
311     cl::desc("Count the induction variable only once when interleaving"));
312 
313 static cl::opt<bool> EnableCondStoresVectorization(
314     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
315     cl::desc("Enable if predication of stores during vectorization."));
316 
317 static cl::opt<unsigned> MaxNestedScalarReductionIC(
318     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
319     cl::desc("The maximum interleave count to use when interleaving a scalar "
320              "reduction in a nested loop."));
321 
322 static cl::opt<bool>
323     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
324                            cl::Hidden,
325                            cl::desc("Prefer in-loop vector reductions, "
326                                     "overriding the target's preference."));
327 
328 static cl::opt<bool> PreferPredicatedReductionSelect(
329     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
330     cl::desc(
331         "Prefer predicating a reduction operation over an after loop select."));
332 
333 cl::opt<bool> EnableVPlanNativePath(
334     "enable-vplan-native-path", cl::init(false), cl::Hidden,
335     cl::desc("Enable VPlan-native vectorization path with "
336              "support for outer loop vectorization."));
337 
338 // FIXME: Remove this switch once we have divergence analysis. Currently we
339 // assume divergent non-backedge branches when this switch is true.
340 cl::opt<bool> EnableVPlanPredication(
341     "enable-vplan-predication", cl::init(false), cl::Hidden,
342     cl::desc("Enable VPlan-native vectorization path predicator with "
343              "support for outer loop vectorization."));
344 
345 // This flag enables the stress testing of the VPlan H-CFG construction in the
346 // VPlan-native vectorization path. It must be used in conjunction with
347 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
348 // verification of the H-CFGs built.
349 static cl::opt<bool> VPlanBuildStressTest(
350     "vplan-build-stress-test", cl::init(false), cl::Hidden,
351     cl::desc(
352         "Build VPlan for every supported loop nest in the function and bail "
353         "out right after the build (stress test the VPlan H-CFG construction "
354         "in the VPlan-native vectorization path)."));
355 
356 cl::opt<bool> llvm::EnableLoopInterleaving(
357     "interleave-loops", cl::init(true), cl::Hidden,
358     cl::desc("Enable loop interleaving in Loop vectorization passes"));
359 cl::opt<bool> llvm::EnableLoopVectorization(
360     "vectorize-loops", cl::init(true), cl::Hidden,
361     cl::desc("Run the Loop vectorization passes"));
362 
363 /// A helper function that returns the type of a loaded or stored value.
364 static Type *getMemInstValueType(Value *I) {
365   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
366          "Expected Load or Store instruction");
367   if (auto *LI = dyn_cast<LoadInst>(I))
368     return LI->getType();
369   return cast<StoreInst>(I)->getValueOperand()->getType();
370 }
371 
372 /// A helper function that returns true if the given type is irregular. The
373 /// type is irregular if its allocated size doesn't equal the store size of an
374 /// element of the corresponding vector type at the given vectorization factor.
375 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
376   // Determine if an array of VF elements of type Ty is "bitcast compatible"
377   // with a <VF x Ty> vector.
378   if (VF.isVector()) {
379     auto *VectorTy = VectorType::get(Ty, VF);
380     return TypeSize::get(VF.getKnownMinValue() *
381                              DL.getTypeAllocSize(Ty).getFixedValue(),
382                          VF.isScalable()) != DL.getTypeStoreSize(VectorTy);
383   }
384 
385   // If the vectorization factor is one, we just check if an array of type Ty
386   // requires padding between elements.
387   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
388 }
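
// For example, assuming a typical data layout, i1 is irregular: its allocated
// size (8 bits) differs from its actual size (1 bit), and for any vector VF
// the VF-byte array size differs from the store size of <VF x i1> (VF bits,
// rounded up to bytes), so hasIrregularType returns true.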
389 
390 /// A helper function that returns the reciprocal of the block probability of
391 /// predicated blocks. If we return X, we are assuming the predicated block
392 /// will execute once for every X iterations of the loop header.
393 ///
394 /// TODO: We should use actual block probability here, if available. Currently,
395 ///       we always assume predicated blocks have a 50% chance of executing.
396 static unsigned getReciprocalPredBlockProb() { return 2; }
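
// For example, given the value of 2 returned above, the cost model assumes a
// predicated block executes on every other iteration of the loop header, so,
// roughly speaking, the cost attributed to such a block is divided by 2 (a
// block cost of 8 is accounted as 4).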
397 
398 /// A helper function that returns an integer or floating-point constant with
399 /// value C.
400 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
401   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
402                            : ConstantFP::get(Ty, C);
403 }
404 
405 /// Returns "best known" trip count for the specified loop \p L as defined by
406 /// the following procedure:
407 ///   1) Returns exact trip count if it is known.
408 ///   2) Returns expected trip count according to profile data if any.
409 ///   3) Returns upper bound estimate if it is known.
410 ///   4) Returns None if all of the above failed.
411 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
412   // Check if exact trip count is known.
413   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
414     return ExpectedTC;
415 
416   // Check if there is an expected trip count available from profile data.
417   if (LoopVectorizeWithBlockFrequency)
418     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
419       return EstimatedTC;
420 
421   // Check if upper bound estimate is known.
422   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
423     return ExpectedTC;
424 
425   return None;
426 }
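
// For example (illustrative numbers only): a loop known to run exactly 12
// iterations returns 12 via case 1; a loop whose profile branch weights imply
// it usually runs about 100 iterations returns roughly 100 via case 2; and a
// loop only known to run at most 64 iterations returns 64 via case 3.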
427 
428 // Forward declare GeneratedRTChecks.
429 class GeneratedRTChecks;
430 
431 namespace llvm {
432 
433 /// InnerLoopVectorizer vectorizes loops which contain only one basic
434 /// block to a specified vectorization factor (VF).
435 /// This class performs the widening of scalars into vectors, or multiple
436 /// scalars. This class also implements the following features:
437 /// * It inserts an epilogue loop for handling loops that don't have iteration
438 ///   counts that are known to be a multiple of the vectorization factor.
439 /// * It handles the code generation for reduction variables.
440 /// * Scalarization (implementation using scalars) of un-vectorizable
441 ///   instructions.
442 /// InnerLoopVectorizer does not perform any vectorization-legality
443 /// checks, and relies on the caller to check for the different legality
444 /// aspects. The InnerLoopVectorizer relies on the
445 /// LoopVectorizationLegality class to provide information about the induction
446 /// and reduction variables that were found in the loop.
447 class InnerLoopVectorizer {
448 public:
449   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
450                       LoopInfo *LI, DominatorTree *DT,
451                       const TargetLibraryInfo *TLI,
452                       const TargetTransformInfo *TTI, AssumptionCache *AC,
453                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
454                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
455                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
456                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
457       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
458         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
459         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
460         PSI(PSI), RTChecks(RTChecks) {
461     // Query this against the original loop and save it here because the profile
462     // of the original loop header may change as the transformation happens.
463     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
464         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
465   }
466 
467   virtual ~InnerLoopVectorizer() = default;
468 
469   /// Create a new empty loop that will contain vectorized instructions later
470   /// on, while the old loop will be used as the scalar remainder. Control flow
471   /// is generated around the vectorized (and scalar epilogue) loops consisting
472   /// of various checks and bypasses. Return the pre-header block of the new
473   /// loop.
474 /// In the case of epilogue vectorization, this function is overridden to
475   /// handle the more complex control flow around the loops.
476   virtual BasicBlock *createVectorizedLoopSkeleton();
477 
478   /// Widen a single instruction within the innermost loop.
479   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
480                         VPTransformState &State);
481 
482   /// Widen a single call instruction within the innermost loop.
483   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
484                             VPTransformState &State);
485 
486   /// Widen a single select instruction within the innermost loop.
487   void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
488                               bool InvariantCond, VPTransformState &State);
489 
490   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
491   void fixVectorizedLoop(VPTransformState &State);
492 
493   // Return true if any runtime check is added.
494   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
495 
496   /// A type for vectorized values in the new loop. Each value from the
497   /// original loop, when vectorized, is represented by UF vector values in the
498   /// new unrolled loop, where UF is the unroll factor.
499   using VectorParts = SmallVector<Value *, 2>;
500 
501   /// Vectorize a single GetElementPtrInst based on information gathered and
502   /// decisions taken during planning.
503   void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
504                 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
505                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
506 
507   /// Vectorize a single PHINode in a block. This method handles the induction
508   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
509   /// arbitrary length vectors.
510   void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
511                            VPValue *StartV, VPValue *Def,
512                            VPTransformState &State);
513 
514   /// A helper function to scalarize a single Instruction in the innermost loop.
515   /// Generates a sequence of scalar instances for each lane between \p MinLane
516   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
517   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
518   /// Instr's operands.
519   void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
520                             const VPIteration &Instance, bool IfPredicateInstr,
521                             VPTransformState &State);
522 
523   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
524   /// is provided, the integer induction variable will first be truncated to
525   /// the corresponding type.
526   void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
527                              VPValue *Def, VPValue *CastDef,
528                              VPTransformState &State);
529 
530   /// Construct the vector value of a scalarized value \p V one lane at a time.
531   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
532                                  VPTransformState &State);
533 
534   /// Try to vectorize interleaved access group \p Group with the base address
535   /// given in \p Addr, optionally masking the vector operations if \p
536   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
537   /// values in the vectorized loop.
538   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
539                                 ArrayRef<VPValue *> VPDefs,
540                                 VPTransformState &State, VPValue *Addr,
541                                 ArrayRef<VPValue *> StoredValues,
542                                 VPValue *BlockInMask = nullptr);
543 
544   /// Vectorize Load and Store instructions with the base address given in \p
545   /// Addr, optionally masking the vector operations if \p BlockInMask is
546   /// non-null. Use \p State to translate given VPValues to IR values in the
547   /// vectorized loop.
548   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
549                                   VPValue *Def, VPValue *Addr,
550                                   VPValue *StoredValue, VPValue *BlockInMask);
551 
552   /// Set the debug location in the builder using the debug location in
553   /// the instruction.
554   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
555 
556   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
557   void fixNonInductionPHIs(VPTransformState &State);
558 
559   /// Create a broadcast instruction. This method generates a broadcast
560   /// instruction (shuffle) for loop invariant values and for the induction
561   /// value. If this is the induction variable, we extend it to N, N+1, ...;
562   /// this is needed because each iteration in the loop corresponds to a SIMD
563   /// element.
564   virtual Value *getBroadcastInstrs(Value *V);
565 
566 protected:
567   friend class LoopVectorizationPlanner;
568 
569   /// A small list of PHINodes.
570   using PhiVector = SmallVector<PHINode *, 4>;
571 
572   /// A type for scalarized values in the new loop. Each value from the
573   /// original loop, when scalarized, is represented by UF x VF scalar values
574   /// in the new unrolled loop, where UF is the unroll factor and VF is the
575   /// vectorization factor.
576   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
577 
578   /// Set up the values of the IVs correctly when exiting the vector loop.
579   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
580                     Value *CountRoundDown, Value *EndValue,
581                     BasicBlock *MiddleBlock);
582 
583   /// Create a new induction variable inside L.
584   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
585                                    Value *Step, Instruction *DL);
586 
587   /// Handle all cross-iteration phis in the header.
588   void fixCrossIterationPHIs(VPTransformState &State);
589 
590   /// Fix a first-order recurrence. This is the second phase of vectorizing
591   /// this phi node.
592   void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);
593 
594   /// Fix a reduction cross-iteration phi. This is the second phase of
595   /// vectorizing this phi node.
596   void fixReduction(PHINode *Phi, VPTransformState &State);
597 
598   /// Clear NSW/NUW flags from reduction instructions if necessary.
599   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
600                                VPTransformState &State);
601 
602   /// Fixup the LCSSA phi nodes in the unique exit block.  This simply
603   /// means we need to add the appropriate incoming value from the middle
604   /// block as exiting edges from the scalar epilogue loop (if present) are
605   /// already in place, and we exit the vector loop exclusively to the middle
606   /// block.
607   void fixLCSSAPHIs(VPTransformState &State);
608 
609   /// Iteratively sink the scalarized operands of a predicated instruction into
610   /// the block that was created for it.
611   void sinkScalarOperands(Instruction *PredInst);
612 
613   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
614   /// represented as.
615   void truncateToMinimalBitwidths(VPTransformState &State);
616 
617   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
618   /// to each vector element of Val. The sequence starts at StartIdx.
619   /// \p Opcode is relevant for FP induction variables.
620   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
621                                Instruction::BinaryOps Opcode =
622                                Instruction::BinaryOpsEnd);
623 
624   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
625   /// variable on which to base the steps, \p Step is the size of the step, and
626   /// \p EntryVal is the value from the original loop that maps to the steps.
627   /// Note that \p EntryVal doesn't have to be an induction variable - it
628   /// can also be a truncate instruction.
629   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
630                         const InductionDescriptor &ID, VPValue *Def,
631                         VPValue *CastDef, VPTransformState &State);
632 
633   /// Create a vector induction phi node based on an existing scalar one. \p
634   /// EntryVal is the value from the original loop that maps to the vector phi
635   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
636   /// truncate instruction, instead of widening the original IV, we widen a
637   /// version of the IV truncated to \p EntryVal's type.
638   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
639                                        Value *Step, Value *Start,
640                                        Instruction *EntryVal, VPValue *Def,
641                                        VPValue *CastDef,
642                                        VPTransformState &State);
643 
644   /// Returns true if an instruction \p I should be scalarized instead of
645   /// vectorized for the chosen vectorization factor.
646   bool shouldScalarizeInstruction(Instruction *I) const;
647 
648   /// Returns true if we should generate a scalar version of \p IV.
649   bool needsScalarInduction(Instruction *IV) const;
650 
651   /// If there is a cast involved in the induction variable \p ID, which should
652   /// be ignored in the vectorized loop body, this function records the
653   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
654   /// cast. We have already proved that the casted Phi is equal to the uncasted
655   /// Phi in the vectorized loop (under a runtime guard), and therefore
656   /// there is no need to vectorize the cast - the same value can be used in the
657   /// vector loop for both the Phi and the cast.
658   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
659   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
660   ///
661   /// \p EntryVal is the value from the original loop that maps to the vector
662   /// phi node and is used to distinguish what is the IV currently being
663   /// processed - original one (if \p EntryVal is a phi corresponding to the
664   /// original IV) or the "newly-created" one based on the proof mentioned above
665   /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
666   /// latter case \p EntryVal is a TruncInst and we must not record anything for
667   /// that IV, but it's error-prone to expect callers of this routine to care
668   /// about that, hence this explicit parameter.
669   void recordVectorLoopValueForInductionCast(
670       const InductionDescriptor &ID, const Instruction *EntryVal,
671       Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
672       unsigned Part, unsigned Lane = UINT_MAX);
673 
674   /// Generate a shuffle sequence that will reverse the vector Vec.
675   virtual Value *reverseVector(Value *Vec);
676 
677   /// Returns (and creates if needed) the original loop trip count.
678   Value *getOrCreateTripCount(Loop *NewLoop);
679 
680   /// Returns (and creates if needed) the trip count of the widened loop.
681   Value *getOrCreateVectorTripCount(Loop *NewLoop);
682 
683   /// Returns a bitcasted value to the requested vector type.
684   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
685   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
686                                 const DataLayout &DL);
687 
688   /// Emit a bypass check to see if the vector trip count is zero, including if
689   /// it overflows.
690   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
691 
692   /// Emit a bypass check to see if all of the SCEV assumptions we've
693   /// had to make are correct. Returns the block containing the checks or
694   /// nullptr if no checks have been added.
695   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
696 
697   /// Emit bypass checks to check any memory assumptions we may have made.
698   /// Returns the block containing the checks or nullptr if no checks have been
699   /// added.
700   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
701 
702   /// Compute the transformed value of Index at offset StartValue using step
703   /// StepValue.
704   /// For integer induction, returns StartValue + Index * StepValue.
705   /// For pointer induction, returns StartValue[Index * StepValue].
706   /// FIXME: The newly created binary instructions should contain nsw/nuw
707   /// flags, which can be found from the original scalar operations.
708   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
709                               const DataLayout &DL,
710                               const InductionDescriptor &ID) const;
711 
712   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
713   /// vector loop preheader, middle block and scalar preheader. Also
714   /// allocate a loop object for the new vector loop and return it.
715   Loop *createVectorLoopSkeleton(StringRef Prefix);
716 
717   /// Create new phi nodes for the induction variables to resume iteration count
718   /// in the scalar epilogue, from where the vectorized loop left off (given by
719   /// \p VectorTripCount).
720   /// In cases where the loop skeleton is more complicated (e.g. epilogue
721   /// vectorization) and the resume values can come from an additional bypass
722   /// block, the \p AdditionalBypass pair provides information about the bypass
723   /// block and the end value on the edge from bypass to this loop.
724   void createInductionResumeValues(
725       Loop *L, Value *VectorTripCount,
726       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
727 
728   /// Complete the loop skeleton by adding debug MDs, creating appropriate
729   /// conditional branches in the middle block, preparing the builder and
730   /// running the verifier. Take in the vector loop \p L as argument, and return
731   /// the preheader of the completed vector loop.
732   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
733 
734   /// Add additional metadata to \p To that was not present on \p Orig.
735   ///
736   /// Currently this is used to add the noalias annotations based on the
737   /// inserted memchecks.  Use this for instructions that are *cloned* into the
738   /// vector loop.
739   void addNewMetadata(Instruction *To, const Instruction *Orig);
740 
741   /// Add metadata from one instruction to another.
742   ///
743   /// This includes both the original MDs from \p From and additional ones (\see
744   /// addNewMetadata).  Use this for *newly created* instructions in the vector
745   /// loop.
746   void addMetadata(Instruction *To, Instruction *From);
747 
748   /// Similar to the previous function but it adds the metadata to a
749   /// vector of instructions.
750   void addMetadata(ArrayRef<Value *> To, Instruction *From);
751 
752   /// Allow subclasses to override and print debug traces before/after vplan
753   /// execution, when trace information is requested.
754   virtual void printDebugTracesAtStart() {}
755   virtual void printDebugTracesAtEnd() {}
756 
757   /// The original loop.
758   Loop *OrigLoop;
759 
760   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
761   /// dynamic knowledge to simplify SCEV expressions and converts them to a
762   /// more usable form.
763   PredicatedScalarEvolution &PSE;
764 
765   /// Loop Info.
766   LoopInfo *LI;
767 
768   /// Dominator Tree.
769   DominatorTree *DT;
770 
771   /// Alias Analysis.
772   AAResults *AA;
773 
774   /// Target Library Info.
775   const TargetLibraryInfo *TLI;
776 
777   /// Target Transform Info.
778   const TargetTransformInfo *TTI;
779 
780   /// Assumption Cache.
781   AssumptionCache *AC;
782 
783   /// Interface to emit optimization remarks.
784   OptimizationRemarkEmitter *ORE;
785 
786   /// LoopVersioning.  It's only set up (non-null) if memchecks were
787   /// used.
788   ///
789   /// This is currently only used to add no-alias metadata based on the
790   /// memchecks.  The actual versioning is performed manually.
791   std::unique_ptr<LoopVersioning> LVer;
792 
793   /// The vectorization SIMD factor to use. Each vector will have this many
794   /// vector elements.
795   ElementCount VF;
796 
797   /// The vectorization unroll factor to use. Each scalar is vectorized to this
798   /// many different vector instructions.
799   unsigned UF;
800 
801   /// The builder that we use
802   IRBuilder<> Builder;
803 
804   // --- Vectorization state ---
805 
806   /// The vector-loop preheader.
807   BasicBlock *LoopVectorPreHeader;
808 
809   /// The scalar-loop preheader.
810   BasicBlock *LoopScalarPreHeader;
811 
812   /// Middle Block between the vector and the scalar.
813   BasicBlock *LoopMiddleBlock;
814 
815   /// The (unique) ExitBlock of the scalar loop.  Note that
816   /// there can be multiple exiting edges reaching this block.
817   BasicBlock *LoopExitBlock;
818 
819   /// The vector loop body.
820   BasicBlock *LoopVectorBody;
821 
822   /// The scalar loop body.
823   BasicBlock *LoopScalarBody;
824 
825   /// A list of all bypass blocks. The first block is the entry of the loop.
826   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
827 
828   /// The new Induction variable which was added to the new block.
829   PHINode *Induction = nullptr;
830 
831   /// The induction variable of the old basic block.
832   PHINode *OldInduction = nullptr;
833 
834   /// Store instructions that were predicated.
835   SmallVector<Instruction *, 4> PredicatedInstructions;
836 
837   /// Trip count of the original loop.
838   Value *TripCount = nullptr;
839 
840   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
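  /// (for example, with TripCount = 17, VF = 4 and UF = 2 this is 16, leaving
  /// one iteration for the scalar remainder loop).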
841   Value *VectorTripCount = nullptr;
842 
843   /// The legality analysis.
844   LoopVectorizationLegality *Legal;
845 
846   /// The profitability analysis.
847   LoopVectorizationCostModel *Cost;
848 
849   // Record whether runtime checks are added.
850   bool AddedSafetyChecks = false;
851 
852   // Holds the end values for each induction variable. We save the end values
853   // so we can later fix up the external users of the induction variables.
854   DenseMap<PHINode *, Value *> IVEndValues;
855 
856   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
857   // fixed up at the end of vector code generation.
858   SmallVector<PHINode *, 8> OrigPHIsToFix;
859 
860   /// BFI and PSI are used to check for profile guided size optimizations.
861   BlockFrequencyInfo *BFI;
862   ProfileSummaryInfo *PSI;
863 
864   // Whether this loop should be optimized for size based on profile-guided
865   // size optimizations.
866   bool OptForSizeBasedOnProfile;
867 
868   /// Structure to hold information about generated runtime checks, responsible
869   /// for cleaning up the checks if vectorization turns out to be unprofitable.
870   GeneratedRTChecks &RTChecks;
871 };
872 
873 class InnerLoopUnroller : public InnerLoopVectorizer {
874 public:
875   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
876                     LoopInfo *LI, DominatorTree *DT,
877                     const TargetLibraryInfo *TLI,
878                     const TargetTransformInfo *TTI, AssumptionCache *AC,
879                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
880                     LoopVectorizationLegality *LVL,
881                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
882                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
883       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
884                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
885                             BFI, PSI, Check) {}
886 
887 private:
888   Value *getBroadcastInstrs(Value *V) override;
889   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
890                        Instruction::BinaryOps Opcode =
891                        Instruction::BinaryOpsEnd) override;
892   Value *reverseVector(Value *Vec) override;
893 };
894 
895 /// Encapsulate information regarding vectorization of a loop and its epilogue.
896 /// This information is meant to be updated and used across two stages of
897 /// epilogue vectorization.
898 struct EpilogueLoopVectorizationInfo {
899   ElementCount MainLoopVF = ElementCount::getFixed(0);
900   unsigned MainLoopUF = 0;
901   ElementCount EpilogueVF = ElementCount::getFixed(0);
902   unsigned EpilogueUF = 0;
903   BasicBlock *MainLoopIterationCountCheck = nullptr;
904   BasicBlock *EpilogueIterationCountCheck = nullptr;
905   BasicBlock *SCEVSafetyCheck = nullptr;
906   BasicBlock *MemSafetyCheck = nullptr;
907   Value *TripCount = nullptr;
908   Value *VectorTripCount = nullptr;
909 
910   EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
911                                 unsigned EUF)
912       : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
913         EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
914     assert(EUF == 1 &&
915            "A high UF for the epilogue loop is likely not beneficial.");
916   }
917 };
918 
919 /// An extension of the inner loop vectorizer that creates a skeleton for a
920 /// vectorized loop that has its epilogue (residual) also vectorized.
921 /// The idea is to run the VPlan on a given loop twice, first to set up the
922 /// skeleton and vectorize the main loop, and second to complete the skeleton
923 /// from the first step and vectorize the epilogue.  This is achieved by
924 /// deriving two concrete strategy classes from this base class and invoking
925 /// them in succession from the loop vectorizer planner.
926 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
927 public:
928   InnerLoopAndEpilogueVectorizer(
929       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
930       DominatorTree *DT, const TargetLibraryInfo *TLI,
931       const TargetTransformInfo *TTI, AssumptionCache *AC,
932       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
933       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
934       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
935       GeneratedRTChecks &Checks)
936       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
937                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
938                             Checks),
939         EPI(EPI) {}
940 
941   // Override this function to handle the more complex control flow around the
942   // three loops.
943   BasicBlock *createVectorizedLoopSkeleton() final override {
944     return createEpilogueVectorizedLoopSkeleton();
945   }
946 
947   /// The interface for creating a vectorized skeleton using one of two
948   /// different strategies, each corresponding to one execution of the VPlan
949   /// as described above.
950   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
951 
952   /// Holds and updates state information required to vectorize the main loop
953   /// and its epilogue in two separate passes. This setup helps us avoid
954   /// regenerating and recomputing runtime safety checks. It also helps us to
955   /// shorten the iteration-count-check path length for the cases where the
956   /// iteration count of the loop is so small that the main vector loop is
957   /// completely skipped.
958   EpilogueLoopVectorizationInfo &EPI;
959 };
960 
961 /// A specialized derived class of inner loop vectorizer that performs
962 /// vectorization of *main* loops in the process of vectorizing loops and their
963 /// epilogues.
964 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
965 public:
966   EpilogueVectorizerMainLoop(
967       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
968       DominatorTree *DT, const TargetLibraryInfo *TLI,
969       const TargetTransformInfo *TTI, AssumptionCache *AC,
970       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
971       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
972       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
973       GeneratedRTChecks &Check)
974       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
975                                        EPI, LVL, CM, BFI, PSI, Check) {}
976   /// Implements the interface for creating a vectorized skeleton using the
977   /// *main loop* strategy (i.e. the first pass of VPlan execution).
978   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
979 
980 protected:
981   /// Emits an iteration count bypass check once for the main loop (when \p
982   /// ForEpilogue is false) and once for the epilogue loop (when \p
983   /// ForEpilogue is true).
984   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
985                                              bool ForEpilogue);
986   void printDebugTracesAtStart() override;
987   void printDebugTracesAtEnd() override;
988 };
989 
990 /// A specialized derived class of inner loop vectorizer that performs
991 /// vectorization of *epilogue* loops in the process of vectorizing loops and
992 /// their epilogues.
993 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
994 public:
995   EpilogueVectorizerEpilogueLoop(
996       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
997       DominatorTree *DT, const TargetLibraryInfo *TLI,
998       const TargetTransformInfo *TTI, AssumptionCache *AC,
999       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
1000       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
1001       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
1002       GeneratedRTChecks &Checks)
1003       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1004                                        EPI, LVL, CM, BFI, PSI, Checks) {}
1005   /// Implements the interface for creating a vectorized skeleton using the
1006   /// *epilogue loop* strategy (i.e. the second pass of VPlan execution).
1007   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1008 
1009 protected:
1010   /// Emits an iteration count bypass check after the main vector loop has
1011   /// finished to see if there are any iterations left to execute by either
1012   /// the vector epilogue or the scalar epilogue.
1013   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1014                                                       BasicBlock *Bypass,
1015                                                       BasicBlock *Insert);
1016   void printDebugTracesAtStart() override;
1017   void printDebugTracesAtEnd() override;
1018 };
1019 } // end namespace llvm
1020 
1021 /// Look for a meaningful debug location on the instruction or its
1022 /// operands.
1023 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1024   if (!I)
1025     return I;
1026 
1027   DebugLoc Empty;
1028   if (I->getDebugLoc() != Empty)
1029     return I;
1030 
1031   for (Use &Op : I->operands()) {
1032     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
1033       if (OpInst->getDebugLoc() != Empty)
1034         return OpInst;
1035   }
1036 
1037   return I;
1038 }
1039 
1040 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
1041   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1042     const DILocation *DIL = Inst->getDebugLoc();
1043     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1044         !isa<DbgInfoIntrinsic>(Inst)) {
1045       assert(!VF.isScalable() && "scalable vectors not yet supported.");
1046       auto NewDIL =
1047           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1048       if (NewDIL)
1049         B.SetCurrentDebugLocation(NewDIL.getValue());
1050       else
1051         LLVM_DEBUG(dbgs()
1052                    << "Failed to create new discriminator: "
1053                    << DIL->getFilename() << " Line: " << DIL->getLine());
1054     } else
1055       B.SetCurrentDebugLocation(DIL);
1057   } else
1058     B.SetCurrentDebugLocation(DebugLoc());
1059 }
1060 
1061 /// Write a record \p DebugMsg about vectorization failure to the debug
1062 /// output stream. If \p I is passed, it is an instruction that prevents
1063 /// vectorization.
1064 #ifndef NDEBUG
1065 static void debugVectorizationFailure(const StringRef DebugMsg,
1066     Instruction *I) {
1067   dbgs() << "LV: Not vectorizing: " << DebugMsg;
1068   if (I != nullptr)
1069     dbgs() << " " << *I;
1070   else
1071     dbgs() << '.';
1072   dbgs() << '\n';
1073 }
1074 #endif
1075 
1076 /// Create an analysis remark that explains why vectorization failed
1077 ///
1078 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1079 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1080 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1081 /// the location of the remark.  \return the remark object that can be
1082 /// streamed to.
1083 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1084     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1085   Value *CodeRegion = TheLoop->getHeader();
1086   DebugLoc DL = TheLoop->getStartLoc();
1087 
1088   if (I) {
1089     CodeRegion = I->getParent();
1090     // If there is no debug location attached to the instruction, fall back to
1091     // using the loop's location.
1092     if (I->getDebugLoc())
1093       DL = I->getDebugLoc();
1094   }
1095 
1096   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
1097   R << "loop not vectorized: ";
1098   return R;
1099 }
1100 
1101 /// Return a value for Step multiplied by VF.
1102 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1103   assert(isa<ConstantInt>(Step) && "Expected an integer step");
1104   Constant *StepVal = ConstantInt::get(
1105       Step->getType(),
1106       cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1107   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1108 }
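
// For example (illustrative values only), with Step = 2 and a scalable
// VF = <vscale x 4> this emits the runtime value vscale * 8, while with a
// fixed VF = 4 it simply returns the constant 8.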
1109 
1110 namespace llvm {
1111 
1112 /// Return the runtime value for VF.
1113 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
1114   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1115   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1116 }
1117 
1118 void reportVectorizationFailure(const StringRef DebugMsg,
1119     const StringRef OREMsg, const StringRef ORETag,
1120     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
1121   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
1122   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1123   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
1124                 ORETag, TheLoop, I) << OREMsg);
1125 }
1126 
1127 } // end namespace llvm
1128 
1129 #ifndef NDEBUG
1130 /// \return string containing a file name and a line # for the given loop.
1131 static std::string getDebugLocString(const Loop *L) {
1132   std::string Result;
1133   if (L) {
1134     raw_string_ostream OS(Result);
1135     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1136       LoopDbgLoc.print(OS);
1137     else
1138       // Just print the module name.
1139       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1140     OS.flush();
1141   }
1142   return Result;
1143 }
1144 #endif
1145 
1146 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1147                                          const Instruction *Orig) {
1148   // If the loop was versioned with memchecks, add the corresponding no-alias
1149   // metadata.
1150   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1151     LVer->annotateInstWithNoAlias(To, Orig);
1152 }
1153 
1154 void InnerLoopVectorizer::addMetadata(Instruction *To,
1155                                       Instruction *From) {
1156   propagateMetadata(To, From);
1157   addNewMetadata(To, From);
1158 }
1159 
1160 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1161                                       Instruction *From) {
1162   for (Value *V : To) {
1163     if (Instruction *I = dyn_cast<Instruction>(V))
1164       addMetadata(I, From);
1165   }
1166 }
1167 
1168 namespace llvm {
1169 
1170 // Loop vectorization cost-model hints about how the scalar epilogue loop
1171 // should be lowered.
1172 enum ScalarEpilogueLowering {
1173 
1174   // The default: allowing scalar epilogues.
1175   CM_ScalarEpilogueAllowed,
1176 
1177   // Vectorization with OptForSize: don't allow epilogues.
1178   CM_ScalarEpilogueNotAllowedOptSize,
1179 
1180   // A special case of vectorization with OptForSize: loops with a very small
1181   // trip count are considered for vectorization under OptForSize, thereby
1182   // making sure the cost of their loop body is dominant, free of runtime
1183   // guards and scalar iteration overheads.
1184   CM_ScalarEpilogueNotAllowedLowTripLoop,
1185 
1186   // Loop hint predicate indicating an epilogue is undesired.
1187   CM_ScalarEpilogueNotNeededUsePredicate,
1188 
  // Directive indicating we must either tail-fold or not vectorize at all.
1190   CM_ScalarEpilogueNotAllowedUsePredicate
1191 };
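
// For orientation (an illustrative, hedged mapping; the precise selection
// logic lives in the cost model): compiling under -Os typically yields
// CM_ScalarEpilogueNotAllowedOptSize, a 'vectorize_predicate(enable)' loop
// hint corresponds to CM_ScalarEpilogueNotNeededUsePredicate, and loops that
// are directed to be tail-folded or not vectorized at all use
// CM_ScalarEpilogueNotAllowedUsePredicate.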
1192 
1193 /// LoopVectorizationCostModel - estimates the expected speedups due to
1194 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
1197 /// expected speedup/slowdowns due to the supported instruction set. We use the
1198 /// TargetTransformInfo to query the different backends for the cost of
1199 /// different operations.
1200 class LoopVectorizationCostModel {
1201 public:
1202   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1203                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1204                              LoopVectorizationLegality *Legal,
1205                              const TargetTransformInfo &TTI,
1206                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1207                              AssumptionCache *AC,
1208                              OptimizationRemarkEmitter *ORE, const Function *F,
1209                              const LoopVectorizeHints *Hints,
1210                              InterleavedAccessInfo &IAI)
1211       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1212         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1213         Hints(Hints), InterleaveInfo(IAI) {}
1214 
1215   /// \return An upper bound for the vectorization factor, or None if
1216   /// vectorization and interleaving should be avoided up front.
1217   Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
1218 
1219   /// \return True if runtime checks are required for vectorization, and false
1220   /// otherwise.
1221   bool runtimeChecksRequired();
1222 
1223   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every power of two up to MaxVF. If UserVF is not zero
1225   /// then this vectorization factor will be selected if vectorization is
1226   /// possible.
1227   VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
1228   VectorizationFactor
1229   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1230                                     const LoopVectorizationPlanner &LVP);
1231 
1232   /// Setup cost-based decisions for user vectorization factor.
1233   void selectUserVectorizationFactor(ElementCount UserVF) {
1234     collectUniformsAndScalars(UserVF);
1235     collectInstsToScalarize(UserVF);
1236   }
1237 
1238   /// \return The size (in bits) of the smallest and widest types in the code
1239   /// that needs to be vectorized. We ignore values that remain scalar such as
1240   /// 64 bit loop indices.
1241   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1242 
1243   /// \return The desired interleave count.
1244   /// If interleave count has been specified by metadata it will be returned.
1245   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1246   /// are the selected vectorization factor and the cost of the selected VF.
1247   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1248 
  /// A memory access instruction may be vectorized in more than one way; the
  /// form it takes after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1256   void setCostBasedWideningDecision(ElementCount VF);
1257 
1258   /// A struct that represents some properties of the register usage
1259   /// of a loop.
1260   struct RegisterUsage {
1261     /// Holds the number of loop invariant values that are used in the loop.
1262     /// The key is ClassID of target-provided register class.
1263     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1264     /// Holds the maximum number of concurrent live intervals in the loop.
1265     /// The key is ClassID of target-provided register class.
1266     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1267   };
1268 
  /// \return Information about the register usage of the loop for the given
  /// vectorization factors.
1271   SmallVector<RegisterUsage, 8>
1272   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1273 
1274   /// Collect values we want to ignore in the cost model.
1275   void collectValuesToIgnore();
1276 
  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
1279   void collectInLoopReductions();
1280 
1281   /// \returns The smallest bitwidth each instruction can be represented with.
1282   /// The vector equivalents of these instructions should be truncated to this
1283   /// type.
1284   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1285     return MinBWs;
1286   }
1287 
1288   /// \returns True if it is more profitable to scalarize instruction \p I for
1289   /// vectorization factor \p VF.
1290   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1291     assert(VF.isVector() &&
1292            "Profitable to scalarize relevant only for VF > 1.");
1293 
1294     // Cost model is not run in the VPlan-native path - return conservative
1295     // result until this changes.
1296     if (EnableVPlanNativePath)
1297       return false;
1298 
1299     auto Scalars = InstsToScalarize.find(VF);
1300     assert(Scalars != InstsToScalarize.end() &&
1301            "VF not yet analyzed for scalarization profitability");
1302     return Scalars->second.find(I) != Scalars->second.end();
1303   }
1304 
1305   /// Returns true if \p I is known to be uniform after vectorization.
1306   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1307     if (VF.isScalar())
1308       return true;
1309 
1310     // Cost model is not run in the VPlan-native path - return conservative
1311     // result until this changes.
1312     if (EnableVPlanNativePath)
1313       return false;
1314 
1315     auto UniformsPerVF = Uniforms.find(VF);
1316     assert(UniformsPerVF != Uniforms.end() &&
1317            "VF not yet analyzed for uniformity");
1318     return UniformsPerVF->second.count(I);
1319   }
1320 
1321   /// Returns true if \p I is known to be scalar after vectorization.
1322   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1323     if (VF.isScalar())
1324       return true;
1325 
1326     // Cost model is not run in the VPlan-native path - return conservative
1327     // result until this changes.
1328     if (EnableVPlanNativePath)
1329       return false;
1330 
1331     auto ScalarsPerVF = Scalars.find(VF);
1332     assert(ScalarsPerVF != Scalars.end() &&
1333            "Scalar values are not calculated for VF");
1334     return ScalarsPerVF->second.count(I);
1335   }
1336 
1337   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1338   /// for vectorization factor \p VF.
1339   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1340     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1341            !isProfitableToScalarize(I, VF) &&
1342            !isScalarAfterVectorization(I, VF);
1343   }
1344 
1345   /// Decision that was taken during cost calculation for memory instruction.
1346   enum InstWidening {
1347     CM_Unknown,
1348     CM_Widen,         // For consecutive accesses with stride +1.
1349     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1350     CM_Interleave,
1351     CM_GatherScatter,
1352     CM_Scalarize
1353   };
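
  // For illustration only (hypothetical accesses, not taken from this file):
  // a unit-stride access such as A[i] would typically be CM_Widen, a reversed
  // access such as A[N - i] CM_Widen_Reverse, members of a group like
  // {A[2*i], A[2*i+1]} CM_Interleave, and an indirect access such as A[B[i]]
  // CM_GatherScatter or CM_Scalarize, depending on target support and cost.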
1354 
1355   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1356   /// instruction \p I and vector width \p VF.
1357   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1358                            InstructionCost Cost) {
1359     assert(VF.isVector() && "Expected VF >=2");
1360     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1361   }
1362 
1363   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1364   /// interleaving group \p Grp and vector width \p VF.
1365   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1366                            ElementCount VF, InstWidening W,
1367                            InstructionCost Cost) {
1368     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1371     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1372       if (auto *I = Grp->getMember(i)) {
1373         if (Grp->getInsertPos() == I)
1374           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1375         else
1376           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1377       }
1378     }
1379   }
1380 
1381   /// Return the cost model decision for the given instruction \p I and vector
1382   /// width \p VF. Return CM_Unknown if this instruction did not pass
1383   /// through the cost modeling.
1384   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
1385     assert(VF.isVector() && "Expected VF to be a vector VF");
1386     // Cost model is not run in the VPlan-native path - return conservative
1387     // result until this changes.
1388     if (EnableVPlanNativePath)
1389       return CM_GatherScatter;
1390 
1391     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1392     auto Itr = WideningDecisions.find(InstOnVF);
1393     if (Itr == WideningDecisions.end())
1394       return CM_Unknown;
1395     return Itr->second.first;
1396   }
1397 
1398   /// Return the vectorization cost for the given instruction \p I and vector
1399   /// width \p VF.
1400   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1401     assert(VF.isVector() && "Expected VF >=2");
1402     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1403     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1404            "The cost is not calculated");
1405     return WideningDecisions[InstOnVF].second;
1406   }
1407 
1408   /// Return True if instruction \p I is an optimizable truncate whose operand
1409   /// is an induction variable. Such a truncate will be removed by adding a new
1410   /// induction variable with the destination type.
1411   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1412     // If the instruction is not a truncate, return false.
1413     auto *Trunc = dyn_cast<TruncInst>(I);
1414     if (!Trunc)
1415       return false;
1416 
1417     // Get the source and destination types of the truncate.
1418     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1419     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1420 
1421     // If the truncate is free for the given types, return false. Replacing a
1422     // free truncate with an induction variable would add an induction variable
1423     // update instruction to each iteration of the loop. We exclude from this
1424     // check the primary induction variable since it will need an update
1425     // instruction regardless.
1426     Value *Op = Trunc->getOperand(0);
1427     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1428       return false;
1429 
1430     // If the truncated value is not an induction variable, return false.
1431     return Legal->isInductionPhi(Op);
1432   }
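
  // Hedged example: for an i64 induction %iv whose use is
  // 'trunc i64 %iv to i32', isOptimizableIVTruncate returns true when the
  // truncate is not free for the target; the vectorizer can then introduce a
  // new i32 induction variable with the destination type and drop the
  // truncate.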
1433 
1434   /// Collects the instructions to scalarize for each predicated instruction in
1435   /// the loop.
1436   void collectInstsToScalarize(ElementCount VF);
1437 
  /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decisions for Load/Store instructions
  /// that may be vectorized as interleaved, gather/scatter or scalarized.
1441   void collectUniformsAndScalars(ElementCount VF) {
1442     // Do the analysis once.
1443     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1444       return;
1445     setCostBasedWideningDecision(VF);
1446     collectLoopUniforms(VF);
1447     collectLoopScalars(VF);
1448   }
1449 
1450   /// Returns true if the target machine supports masked store operation
1451   /// for the given \p DataType and kind of access to \p Ptr.
1452   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
1453     return Legal->isConsecutivePtr(Ptr) &&
1454            TTI.isLegalMaskedStore(DataType, Alignment);
1455   }
1456 
1457   /// Returns true if the target machine supports masked load operation
1458   /// for the given \p DataType and kind of access to \p Ptr.
1459   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
1460     return Legal->isConsecutivePtr(Ptr) &&
1461            TTI.isLegalMaskedLoad(DataType, Alignment);
1462   }
1463 
1464   /// Returns true if the target machine supports masked scatter operation
1465   /// for the given \p DataType.
1466   bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
1467     return TTI.isLegalMaskedScatter(DataType, Alignment);
1468   }
1469 
1470   /// Returns true if the target machine supports masked gather operation
1471   /// for the given \p DataType.
1472   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
1473     return TTI.isLegalMaskedGather(DataType, Alignment);
1474   }
1475 
1476   /// Returns true if the target machine can represent \p V as a masked gather
1477   /// or scatter operation.
1478   bool isLegalGatherOrScatter(Value *V) {
1479     bool LI = isa<LoadInst>(V);
1480     bool SI = isa<StoreInst>(V);
1481     if (!LI && !SI)
1482       return false;
1483     auto *Ty = getMemInstValueType(V);
1484     Align Align = getLoadStoreAlignment(V);
1485     return (LI && isLegalMaskedGather(Ty, Align)) ||
1486            (SI && isLegalMaskedScatter(Ty, Align));
1487   }
1488 
1489   /// Returns true if the target machine supports all of the reduction
1490   /// variables found for the given VF.
1491   bool canVectorizeReductions(ElementCount VF) {
1492     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1493       RecurrenceDescriptor RdxDesc = Reduction.second;
1494       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1495     }));
1496   }
1497 
1498   /// Returns true if \p I is an instruction that will be scalarized with
1499   /// predication. Such instructions include conditional stores and
1500   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1503   bool isScalarWithPredication(Instruction *I,
1504                                ElementCount VF = ElementCount::getFixed(1));
1505 
1506   // Returns true if \p I is an instruction that will be predicated either
1507   // through scalar predication or masked load/store or masked gather/scatter.
1508   // Superset of instructions that return true for isScalarWithPredication.
1509   bool isPredicatedInst(Instruction *I) {
1510     if (!blockNeedsPredication(I->getParent()))
1511       return false;
1512     // Loads and stores that need some form of masked operation are predicated
1513     // instructions.
1514     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1515       return Legal->isMaskRequired(I);
1516     return isScalarWithPredication(I);
1517   }
1518 
1519   /// Returns true if \p I is a memory instruction with consecutive memory
1520   /// access that can be widened.
1521   bool
1522   memoryInstructionCanBeWidened(Instruction *I,
1523                                 ElementCount VF = ElementCount::getFixed(1));
1524 
1525   /// Returns true if \p I is a memory instruction in an interleaved-group
1526   /// of memory accesses that can be vectorized with wide vector loads/stores
1527   /// and shuffles.
1528   bool
1529   interleavedAccessCanBeWidened(Instruction *I,
1530                                 ElementCount VF = ElementCount::getFixed(1));
1531 
1532   /// Check if \p Instr belongs to any interleaved access group.
1533   bool isAccessInterleaved(Instruction *Instr) {
1534     return InterleaveInfo.isInterleaved(Instr);
1535   }
1536 
1537   /// Get the interleaved access group that \p Instr belongs to.
1538   const InterleaveGroup<Instruction> *
1539   getInterleavedAccessGroup(Instruction *Instr) {
1540     return InterleaveInfo.getInterleaveGroup(Instr);
1541   }
1542 
1543   /// Returns true if we're required to use a scalar epilogue for at least
1544   /// the final iteration of the original loop.
1545   bool requiresScalarEpilogue() const {
1546     if (!isScalarEpilogueAllowed())
1547       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
1550     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1551       return true;
1552     return InterleaveInfo.requiresScalarEpilogue();
1553   }
1554 
  /// Returns true if a scalar epilogue is allowed, i.e. it is not disallowed
  /// due to optsize or a loop hint annotation.
1557   bool isScalarEpilogueAllowed() const {
1558     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1559   }
1560 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1562   bool foldTailByMasking() const { return FoldTailByMasking; }
1563 
1564   bool blockNeedsPredication(BasicBlock *BB) {
1565     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1566   }
1567 
1568   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1569   /// nodes to the chain of instructions representing the reductions. Uses a
1570   /// MapVector to ensure deterministic iteration order.
1571   using ReductionChainMap =
1572       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1573 
1574   /// Return the chain of instructions representing an inloop reduction.
1575   const ReductionChainMap &getInLoopReductionChains() const {
1576     return InLoopReductionChains;
1577   }
1578 
1579   /// Returns true if the Phi is part of an inloop reduction.
1580   bool isInLoopReduction(PHINode *Phi) const {
1581     return InLoopReductionChains.count(Phi);
1582   }
1583 
1584   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1585   /// with factor VF.  Return the cost of the instruction, including
1586   /// scalarization overhead if it's needed.
1587   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
1588 
  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e. either the vector version isn't available, or it is
  /// too expensive.
1594   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1595                                     bool &NeedToScalarize);
1596 
1597   /// Invalidates decisions already taken by the cost model.
1598   void invalidateCostModelingDecisions() {
1599     WideningDecisions.clear();
1600     Uniforms.clear();
1601     Scalars.clear();
1602   }
1603 
1604 private:
1605   unsigned NumPredStores = 0;
1606 
1607   /// \return An upper bound for the vectorization factor, a power-of-2 larger
1608   /// than zero. One is returned if vectorization should best be avoided due
1609   /// to cost.
1610   ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
1611                                     ElementCount UserVF);
1612 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1620   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1621 
1622   /// Returns the expected execution cost. The unit of the cost does
1623   /// not matter because we use the 'cost' units to compare different
1624   /// vector widths. The cost that is returned is *not* normalized by
1625   /// the factor width.
1626   VectorizationCostTy expectedCost(ElementCount VF);
1627 
1628   /// Returns the execution time cost of an instruction for a given vector
1629   /// width. Vector width of one means scalar.
1630   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1631 
1632   /// The cost-computation logic from getInstructionCost which provides
1633   /// the vector type as an output parameter.
1634   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1635                                      Type *&VectorTy);
1636 
1637   /// Return the cost of instructions in an inloop reduction pattern, if I is
1638   /// part of that pattern.
1639   InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
1640                                           Type *VectorTy,
1641                                           TTI::TargetCostKind CostKind);
1642 
1643   /// Calculate vectorization cost of memory instruction \p I.
1644   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1645 
1646   /// The cost computation for scalarized memory instruction.
1647   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1648 
1649   /// The cost computation for interleaving group of memory instructions.
1650   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1651 
1652   /// The cost computation for Gather/Scatter instruction.
1653   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1654 
1655   /// The cost computation for widening instruction \p I with consecutive
1656   /// memory access.
1657   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1658 
  /// The cost calculation for Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element).
1663   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1664 
1665   /// Estimate the overhead of scalarizing an instruction. This is a
1666   /// convenience wrapper for the type-based getScalarizationOverhead API.
1667   InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);
1668 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1671   bool isConsecutiveLoadOrStore(Instruction *I);
1672 
1673   /// Returns true if an artificially high cost for emulated masked memrefs
1674   /// should be used.
1675   bool useEmulatedMaskMemRefHack(Instruction *I);
1676 
1677   /// Map of scalar integer values to the smallest bitwidth they can be legally
1678   /// represented as. The vector equivalents of these values should be truncated
1679   /// to this type.
1680   MapVector<Instruction *, uint64_t> MinBWs;
1681 
1682   /// A type representing the costs for instructions if they were to be
1683   /// scalarized rather than vectorized. The entries are Instruction-Cost
1684   /// pairs.
1685   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1686 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1689   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1690 
1691   /// Records whether it is allowed to have the original scalar loop execute at
1692   /// least once. This may be needed as a fallback loop in case runtime
1693   /// aliasing/dependence checks fail, or to handle the tail/remainder
1694   /// iterations when the trip count is unknown or doesn't divide by the VF,
1695   /// or as a peel-loop to handle gaps in interleave-groups.
1696   /// Under optsize and when the trip count is very small we don't allow any
1697   /// iterations to execute in the scalar loop.
1698   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1699 
1700   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1701   bool FoldTailByMasking = false;
1702 
1703   /// A map holding scalar costs for different vectorization factors. The
1704   /// presence of a cost for an instruction in the mapping indicates that the
1705   /// instruction will be scalarized when vectorizing with the associated
1706   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1707   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1708 
1709   /// Holds the instructions known to be uniform after vectorization.
1710   /// The data is collected per VF.
1711   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1712 
1713   /// Holds the instructions known to be scalar after vectorization.
1714   /// The data is collected per VF.
1715   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1716 
1717   /// Holds the instructions (address computations) that are forced to be
1718   /// scalarized.
1719   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1720 
  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from
  /// top (PHI) to bottom.
1724   ReductionChainMap InLoopReductionChains;
1725 
1726   /// A Map of inloop reduction operations and their immediate chain operand.
1727   /// FIXME: This can be removed once reductions can be costed correctly in
1728   /// vplan. This was added to allow quick lookup to the inloop operations,
1729   /// without having to loop through InLoopReductionChains.
1730   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1731 
1732   /// Returns the expected difference in cost from scalarizing the expression
1733   /// feeding a predicated instruction \p PredInst. The instructions to
1734   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1735   /// non-negative return value implies the expression will be scalarized.
1736   /// Currently, only single-use chains are considered for scalarization.
1737   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1738                               ElementCount VF);
1739 
1740   /// Collect the instructions that are uniform after vectorization. An
1741   /// instruction is uniform if we represent it with a single scalar value in
1742   /// the vectorized loop corresponding to each vector iteration. Examples of
1743   /// uniform instructions include pointer operands of consecutive or
1744   /// interleaved memory accesses. Note that although uniformity implies an
1745   /// instruction will be scalar, the reverse is not true. In general, a
1746   /// scalarized instruction will be represented by VF scalar values in the
1747   /// vectorized loop, each corresponding to an iteration of the original
1748   /// scalar loop.
1749   void collectLoopUniforms(ElementCount VF);
1750 
1751   /// Collect the instructions that are scalar after vectorization. An
1752   /// instruction is scalar if it is known to be uniform or will be scalarized
1753   /// during vectorization. Non-uniform scalarized instructions will be
1754   /// represented by VF values in the vectorized loop, each corresponding to an
1755   /// iteration of the original scalar loop.
1756   void collectLoopScalars(ElementCount VF);
1757 
1758   /// Keeps cost model vectorization decision and cost for instructions.
1759   /// Right now it is used for memory instructions only.
1760   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1761                                 std::pair<InstWidening, InstructionCost>>;
1762 
1763   DecisionList WideningDecisions;
1764 
1765   /// Returns true if \p V is expected to be vectorized and it needs to be
1766   /// extracted.
1767   bool needsExtract(Value *V, ElementCount VF) const {
1768     Instruction *I = dyn_cast<Instruction>(V);
1769     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1770         TheLoop->isLoopInvariant(I))
1771       return false;
1772 
1773     // Assume we can vectorize V (and hence we need extraction) if the
1774     // scalars are not computed yet. This can happen, because it is called
1775     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1776     // the scalars are collected. That should be a safe assumption in most
1777     // cases, because we check if the operands have vectorizable types
1778     // beforehand in LoopVectorizationLegality.
1779     return Scalars.find(VF) == Scalars.end() ||
1780            !isScalarAfterVectorization(I, VF);
1781   };
1782 
1783   /// Returns a range containing only operands needing to be extracted.
1784   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1785                                                    ElementCount VF) {
1786     return SmallVector<Value *, 4>(make_filter_range(
1787         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1788   }
1789 
1790   /// Determines if we have the infrastructure to vectorize loop \p L and its
1791   /// epilogue, assuming the main loop is vectorized by \p VF.
1792   bool isCandidateForEpilogueVectorization(const Loop &L,
1793                                            const ElementCount VF) const;
1794 
1795   /// Returns true if epilogue vectorization is considered profitable, and
1796   /// false otherwise.
1797   /// \p VF is the vectorization factor chosen for the original loop.
1798   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1799 
1800 public:
1801   /// The loop that we evaluate.
1802   Loop *TheLoop;
1803 
1804   /// Predicated scalar evolution analysis.
1805   PredicatedScalarEvolution &PSE;
1806 
1807   /// Loop Info analysis.
1808   LoopInfo *LI;
1809 
1810   /// Vectorization legality.
1811   LoopVectorizationLegality *Legal;
1812 
1813   /// Vector target information.
1814   const TargetTransformInfo &TTI;
1815 
1816   /// Target Library Info.
1817   const TargetLibraryInfo *TLI;
1818 
1819   /// Demanded bits analysis.
1820   DemandedBits *DB;
1821 
1822   /// Assumption cache.
1823   AssumptionCache *AC;
1824 
1825   /// Interface to emit optimization remarks.
1826   OptimizationRemarkEmitter *ORE;
1827 
1828   const Function *TheFunction;
1829 
1830   /// Loop Vectorize Hint.
1831   const LoopVectorizeHints *Hints;
1832 
1833   /// The interleave access information contains groups of interleaved accesses
1834   /// with the same stride and close to each other.
1835   InterleavedAccessInfo &InterleaveInfo;
1836 
1837   /// Values to ignore in the cost model.
1838   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1839 
1840   /// Values to ignore in the cost model when VF > 1.
1841   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1842 
1843   /// Profitable vector factors.
1844   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1845 };
1846 } // end namespace llvm
1847 
/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimation of their cost, and are kept un-linked from the existing IR.
/// After deciding to vectorize, the checks are moved back. If the decision is
/// not to vectorize, the temporary blocks are removed completely.
1854 class GeneratedRTChecks {
1855   /// Basic block which contains the generated SCEV checks, if any.
1856   BasicBlock *SCEVCheckBlock = nullptr;
1857 
1858   /// The value representing the result of the generated SCEV checks. If it is
1859   /// nullptr, either no SCEV checks have been generated or they have been used.
1860   Value *SCEVCheckCond = nullptr;
1861 
1862   /// Basic block which contains the generated memory runtime checks, if any.
1863   BasicBlock *MemCheckBlock = nullptr;
1864 
1865   /// The value representing the result of the generated memory runtime checks.
1866   /// If it is nullptr, either no memory runtime checks have been generated or
1867   /// they have been used.
1868   Instruction *MemRuntimeCheckCond = nullptr;
1869 
1870   DominatorTree *DT;
1871   LoopInfo *LI;
1872 
1873   SCEVExpander SCEVExp;
1874   SCEVExpander MemCheckExp;
1875 
1876 public:
1877   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1878                     const DataLayout &DL)
1879       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1880         MemCheckExp(SE, DL, "scev.check") {}
1881 
  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
1887   void Create(Loop *L, const LoopAccessInfo &LAI,
1888               const SCEVUnionPredicate &UnionPred) {
1889 
1890     BasicBlock *LoopHeader = L->getHeader();
1891     BasicBlock *Preheader = L->getLoopPreheader();
1892 
1893     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1894     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1895     // may be used by SCEVExpander. The blocks will be un-linked from their
1896     // predecessors and removed from LI & DT at the end of the function.
1897     if (!UnionPred.isAlwaysTrue()) {
1898       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1899                                   nullptr, "vector.scevcheck");
1900 
1901       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1902           &UnionPred, SCEVCheckBlock->getTerminator());
1903     }
1904 
1905     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1906     if (RtPtrChecking.Need) {
1907       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1908       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1909                                  "vector.memcheck");
1910 
1911       std::tie(std::ignore, MemRuntimeCheckCond) =
1912           addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1913                            RtPtrChecking.getChecks(), MemCheckExp);
1914       assert(MemRuntimeCheckCond &&
1915              "no RT checks generated although RtPtrChecking "
1916              "claimed checks are required");
1917     }
1918 
1919     if (!MemCheckBlock && !SCEVCheckBlock)
1920       return;
1921 
    // Unhook the temporary blocks with the checks and update various places
    // accordingly.
1924     if (SCEVCheckBlock)
1925       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1926     if (MemCheckBlock)
1927       MemCheckBlock->replaceAllUsesWith(Preheader);
1928 
1929     if (SCEVCheckBlock) {
1930       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1931       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1932       Preheader->getTerminator()->eraseFromParent();
1933     }
1934     if (MemCheckBlock) {
1935       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1936       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1937       Preheader->getTerminator()->eraseFromParent();
1938     }
1939 
1940     DT->changeImmediateDominator(LoopHeader, Preheader);
1941     if (MemCheckBlock) {
1942       DT->eraseNode(MemCheckBlock);
1943       LI->removeBlock(MemCheckBlock);
1944     }
1945     if (SCEVCheckBlock) {
1946       DT->eraseNode(SCEVCheckBlock);
1947       LI->removeBlock(SCEVCheckBlock);
1948     }
1949   }
1950 
1951   /// Remove the created SCEV & memory runtime check blocks & instructions, if
1952   /// unused.
1953   ~GeneratedRTChecks() {
1954     SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
1955     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
1956     if (!SCEVCheckCond)
1957       SCEVCleaner.markResultUsed();
1958 
1959     if (!MemRuntimeCheckCond)
1960       MemCheckCleaner.markResultUsed();
1961 
1962     if (MemRuntimeCheckCond) {
1963       auto &SE = *MemCheckExp.getSE();
1964       // Memory runtime check generation creates compares that use expanded
1965       // values. Remove them before running the SCEVExpanderCleaners.
1966       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
1967         if (MemCheckExp.isInsertedInstruction(&I))
1968           continue;
1969         SE.forgetValue(&I);
1970         SE.eraseValueFromMap(&I);
1971         I.eraseFromParent();
1972       }
1973     }
1974     MemCheckCleaner.cleanup();
1975     SCEVCleaner.cleanup();
1976 
1977     if (SCEVCheckCond)
1978       SCEVCheckBlock->eraseFromParent();
1979     if (MemRuntimeCheckCond)
1980       MemCheckBlock->eraseFromParent();
1981   }
1982 
1983   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
1984   /// adjusts the branches to branch to the vector preheader or \p Bypass,
1985   /// depending on the generated condition.
1986   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
1987                              BasicBlock *LoopVectorPreHeader,
1988                              BasicBlock *LoopExitBlock) {
1989     if (!SCEVCheckCond)
1990       return nullptr;
1991     if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
1992       if (C->isZero())
1993         return nullptr;
1994 
1995     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
1996 
1997     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
1998     // Create new preheader for vector loop.
1999     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2000       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2001 
2002     SCEVCheckBlock->getTerminator()->eraseFromParent();
2003     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2004     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2005                                                 SCEVCheckBlock);
2006 
2007     DT->addNewBlock(SCEVCheckBlock, Pred);
2008     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2009 
2010     ReplaceInstWithInst(
2011         SCEVCheckBlock->getTerminator(),
2012         BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2013     // Mark the check as used, to prevent it from being removed during cleanup.
2014     SCEVCheckCond = nullptr;
2015     return SCEVCheckBlock;
2016   }
2017 
2018   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2019   /// the branches to branch to the vector preheader or \p Bypass, depending on
2020   /// the generated condition.
2021   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2022                                    BasicBlock *LoopVectorPreHeader) {
2023     // Check if we generated code that checks in runtime if arrays overlap.
2024     if (!MemRuntimeCheckCond)
2025       return nullptr;
2026 
2027     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2028     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2029                                                 MemCheckBlock);
2030 
2031     DT->addNewBlock(MemCheckBlock, Pred);
2032     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2033     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2034 
2035     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2036       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2037 
2038     ReplaceInstWithInst(
2039         MemCheckBlock->getTerminator(),
2040         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2041     MemCheckBlock->getTerminator()->setDebugLoc(
2042         Pred->getTerminator()->getDebugLoc());
2043 
2044     // Mark the check as used, to prevent it from being removed during cleanup.
2045     MemRuntimeCheckCond = nullptr;
2046     return MemCheckBlock;
2047   }
2048 };
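
// Usage sketch (illustrative only and simplified; names such as SE, DT, LI,
// DL, L, LAI, UnionPred, Bypass, VectorPH and ExitBB are placeholders for
// values the caller already has):
//
//   GeneratedRTChecks Checks(SE, DT, LI, DL);
//   Checks.Create(L, LAI, UnionPred);       // cost the temporary check blocks
//   ...
//   if (DecidedToVectorizeWithRuntimeChecks) {
//     Checks.emitSCEVChecks(L, Bypass, VectorPH, ExitBB);
//     Checks.emitMemRuntimeChecks(L, Bypass, VectorPH);
//   }
//   // Otherwise ~GeneratedRTChecks() erases the temporary check blocks.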
2049 
2050 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2051 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
2057 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2058 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2059 // provides *explicit vectorization hints* (LV can bypass legal checks and
2060 // assume that vectorization is legal). However, both hints are implemented
2061 // using the same metadata (llvm.loop.vectorize, processed by
2062 // LoopVectorizeHints). This will be fixed in the future when the native IR
2063 // representation for pragma 'omp simd' is introduced.
2064 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2065                                    OptimizationRemarkEmitter *ORE) {
2066   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2067   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2068 
2069   // Only outer loops with an explicit vectorization hint are supported.
2070   // Unannotated outer loops are ignored.
2071   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2072     return false;
2073 
2074   Function *Fn = OuterLp->getHeader()->getParent();
2075   if (!Hints.allowVectorization(Fn, OuterLp,
2076                                 true /*VectorizeOnlyWhenForced*/)) {
2077     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2078     return false;
2079   }
2080 
2081   if (Hints.getInterleave() > 1) {
2082     // TODO: Interleave support is future work.
2083     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2084                          "outer loops.\n");
2085     Hints.emitRemarkWithHints();
2086     return false;
2087   }
2088 
2089   return true;
2090 }
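
// For example (an illustrative source-level annotation; any of the spellings
// described above that carry an explicit vector length would qualify):
//
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < n; ++i)     // outer loop considered here
//     for (int j = 0; j < m; ++j)
//       a[i][j] += b[i][j];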
2091 
2092 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2093                                   OptimizationRemarkEmitter *ORE,
2094                                   SmallVectorImpl<Loop *> &V) {
2095   // Collect inner loops and outer loops without irreducible control flow. For
2096   // now, only collect outer loops that have explicit vectorization hints. If we
2097   // are stress testing the VPlan H-CFG construction, we collect the outermost
2098   // loop of every loop nest.
2099   if (L.isInnermost() || VPlanBuildStressTest ||
2100       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2101     LoopBlocksRPO RPOT(&L);
2102     RPOT.perform(LI);
2103     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2104       V.push_back(&L);
2105       // TODO: Collect inner loops inside marked outer loops in case
2106       // vectorization fails for the outer loop. Do not invoke
2107       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2108       // already known to be reducible. We can use an inherited attribute for
2109       // that.
2110       return;
2111     }
2112   }
2113   for (Loop *InnerL : L)
2114     collectSupportedLoops(*InnerL, LI, ORE, V);
2115 }
2116 
2117 namespace {
2118 
2119 /// The LoopVectorize Pass.
2120 struct LoopVectorize : public FunctionPass {
2121   /// Pass identification, replacement for typeid
2122   static char ID;
2123 
2124   LoopVectorizePass Impl;
2125 
2126   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2127                          bool VectorizeOnlyWhenForced = false)
2128       : FunctionPass(ID),
2129         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2130     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2131   }
2132 
2133   bool runOnFunction(Function &F) override {
2134     if (skipFunction(F))
2135       return false;
2136 
2137     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2138     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2139     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2140     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2141     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2142     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2143     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2144     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2145     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2146     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2147     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2148     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2149     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2150 
2151     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2152         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2153 
2154     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2155                         GetLAA, *ORE, PSI).MadeAnyChange;
2156   }
2157 
2158   void getAnalysisUsage(AnalysisUsage &AU) const override {
2159     AU.addRequired<AssumptionCacheTracker>();
2160     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2161     AU.addRequired<DominatorTreeWrapperPass>();
2162     AU.addRequired<LoopInfoWrapperPass>();
2163     AU.addRequired<ScalarEvolutionWrapperPass>();
2164     AU.addRequired<TargetTransformInfoWrapperPass>();
2165     AU.addRequired<AAResultsWrapperPass>();
2166     AU.addRequired<LoopAccessLegacyAnalysis>();
2167     AU.addRequired<DemandedBitsWrapperPass>();
2168     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2169     AU.addRequired<InjectTLIMappingsLegacy>();
2170 
2171     // We currently do not preserve loopinfo/dominator analyses with outer loop
2172     // vectorization. Until this is addressed, mark these analyses as preserved
2173     // only for non-VPlan-native path.
2174     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2175     if (!EnableVPlanNativePath) {
2176       AU.addPreserved<LoopInfoWrapperPass>();
2177       AU.addPreserved<DominatorTreeWrapperPass>();
2178     }
2179 
2180     AU.addPreserved<BasicAAWrapperPass>();
2181     AU.addPreserved<GlobalsAAWrapperPass>();
2182     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2183   }
2184 };
2185 
2186 } // end anonymous namespace
2187 
2188 //===----------------------------------------------------------------------===//
2189 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2190 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2191 //===----------------------------------------------------------------------===//
2192 
2193 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
2197   Instruction *Instr = dyn_cast<Instruction>(V);
2198   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2199                      (!Instr ||
2200                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2201   // Place the code for broadcasting invariant variables in the new preheader.
2202   IRBuilder<>::InsertPointGuard Guard(Builder);
2203   if (SafeToHoist)
2204     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2205 
2206   // Broadcast the scalar into all locations in the vector.
2207   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2208 
2209   return Shuf;
2210 }
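
// Roughly speaking (a hedged sketch of what CreateVectorSplat produces for a
// fixed VF of 4 and an i32 value %v; the exact IR and value names may differ):
//
//   %broadcast.ins = insertelement <4 x i32> poison, i32 %v, i32 0
//   %broadcast     = shufflevector <4 x i32> %broadcast.ins, <4 x i32> poison,
//                                  <4 x i32> zeroinitializer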
2211 
2212 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2213     const InductionDescriptor &II, Value *Step, Value *Start,
2214     Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
2215     VPTransformState &State) {
2216   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2217          "Expected either an induction phi-node or a truncate of it!");
2218 
  // Construct the initial value of the vector IV in the vector loop preheader.
2220   auto CurrIP = Builder.saveIP();
2221   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2222   if (isa<TruncInst>(EntryVal)) {
2223     assert(Start->getType()->isIntegerTy() &&
2224            "Truncation requires an integer type");
2225     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2226     Step = Builder.CreateTrunc(Step, TruncType);
2227     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2228   }
2229   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2230   Value *SteppedStart =
2231       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2232 
2233   // We create vector phi nodes for both integer and floating-point induction
2234   // variables. Here, we determine the kind of arithmetic we will perform.
2235   Instruction::BinaryOps AddOp;
2236   Instruction::BinaryOps MulOp;
2237   if (Step->getType()->isIntegerTy()) {
2238     AddOp = Instruction::Add;
2239     MulOp = Instruction::Mul;
2240   } else {
2241     AddOp = II.getInductionOpcode();
2242     MulOp = Instruction::FMul;
2243   }
2244 
2245   // Multiply the vectorization factor by the step using integer or
2246   // floating-point arithmetic as appropriate.
2247   Value *ConstVF =
2248       getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
2249   Value *Mul = Builder.CreateBinOp(MulOp, Step, ConstVF);
2250 
2251   // Create a vector splat to use in the induction update.
2252   //
2253   // FIXME: If the step is non-constant, we create the vector splat with
2254   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2255   //        handle a constant vector splat.
2256   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2257   Value *SplatVF = isa<Constant>(Mul)
2258                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2259                        : Builder.CreateVectorSplat(VF, Mul);
2260   Builder.restoreIP(CurrIP);
2261 
2262   // We may need to add the step a number of times, depending on the unroll
2263   // factor. The last of those goes into the PHI.
2264   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2265                                     &*LoopVectorBody->getFirstInsertionPt());
2266   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2267   Instruction *LastInduction = VecInd;
2268   for (unsigned Part = 0; Part < UF; ++Part) {
2269     State.set(Def, LastInduction, Part);
2270 
2271     if (isa<TruncInst>(EntryVal))
2272       addMetadata(LastInduction, EntryVal);
2273     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
2274                                           State, Part);
2275 
2276     LastInduction = cast<Instruction>(
2277         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2278     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2279   }
2280 
2281   // Move the last step to the end of the latch block. This ensures consistent
2282   // placement of all induction updates.
2283   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2284   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2285   auto *ICmp = cast<Instruction>(Br->getCondition());
2286   LastInduction->moveBefore(ICmp);
2287   LastInduction->setName("vec.ind.next");
2288 
2289   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2290   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2291 }
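
// A hedged illustration of the result for a simple i32 induction starting at
// 0 with step 1, VF = 4 and UF = 1 (names and constants are examples only):
//
//   vector.body:
//     %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                              [ %vec.ind.next, %vector.body ]
//     ...
//     %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>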
2292 
2293 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2294   return Cost->isScalarAfterVectorization(I, VF) ||
2295          Cost->isProfitableToScalarize(I, VF);
2296 }
2297 
2298 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2299   if (shouldScalarizeInstruction(IV))
2300     return true;
2301   auto isScalarInst = [&](User *U) -> bool {
2302     auto *I = cast<Instruction>(U);
2303     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2304   };
2305   return llvm::any_of(IV->users(), isScalarInst);
2306 }
2307 
2308 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2309     const InductionDescriptor &ID, const Instruction *EntryVal,
2310     Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
2311     unsigned Part, unsigned Lane) {
2312   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2313          "Expected either an induction phi-node or a truncate of it!");
2314 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV, based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
2321   if (isa<TruncInst>(EntryVal))
2322     return;
2323 
2324   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2325   if (Casts.empty())
2326     return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if any) have no uses outside the
  // induction update chain itself.
2330   if (Lane < UINT_MAX)
2331     State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
2332   else
2333     State.set(CastDef, VectorLoopVal, Part);
2334 }
2335 
2336 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
2337                                                 TruncInst *Trunc, VPValue *Def,
2338                                                 VPValue *CastDef,
2339                                                 VPTransformState &State) {
2340   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2341          "Primary induction variable must have an integer type");
2342 
2343   auto II = Legal->getInductionVars().find(IV);
2344   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2345 
2346   auto ID = II->second;
2347   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2348 
2349   // The value from the original loop to which we are mapping the new induction
2350   // variable.
2351   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2352 
2353   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2354 
  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
2357   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2358     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2359            "Induction step should be loop invariant");
2360     if (PSE.getSE()->isSCEVable(IV->getType())) {
2361       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2362       return Exp.expandCodeFor(Step, Step->getType(),
2363                                LoopVectorPreHeader->getTerminator());
2364     }
2365     return cast<SCEVUnknown>(Step)->getValue();
2366   };
2367 
2368   // The scalar value to broadcast. This is derived from the canonical
2369   // induction variable. If a truncation type is given, truncate the canonical
2370   // induction variable and step. Otherwise, derive these values from the
2371   // induction descriptor.
2372   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2373     Value *ScalarIV = Induction;
2374     if (IV != OldInduction) {
2375       ScalarIV = IV->getType()->isIntegerTy()
2376                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2377                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2378                                           IV->getType());
2379       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2380       ScalarIV->setName("offset.idx");
2381     }
2382     if (Trunc) {
2383       auto *TruncType = cast<IntegerType>(Trunc->getType());
2384       assert(Step->getType()->isIntegerTy() &&
2385              "Truncation requires an integer step");
2386       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2387       Step = Builder.CreateTrunc(Step, TruncType);
2388     }
2389     return ScalarIV;
2390   };
2391 
  // Create the vector values from the scalar IV, in the absence of a vector
  // IV.
2394   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2395     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2396     for (unsigned Part = 0; Part < UF; ++Part) {
2397       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2398       Value *EntryPart =
2399           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2400                         ID.getInductionOpcode());
2401       State.set(Def, EntryPart, Part);
2402       if (Trunc)
2403         addMetadata(EntryPart, Trunc);
2404       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
2405                                             State, Part);
2406     }
2407   };
2408 
2409   // Fast-math-flags propagate from the original induction instruction.
2410   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2411   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2412     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2413 
2414   // Now do the actual transformations, and start with creating the step value.
2415   Value *Step = CreateStepValue(ID.getStep());
2416   if (VF.isZero() || VF.isScalar()) {
2417     Value *ScalarIV = CreateScalarIV(Step);
2418     CreateSplatIV(ScalarIV, Step);
2419     return;
2420   }
2421 
2422   // Determine if we want a scalar version of the induction variable. This is
2423   // true if the induction variable itself is not widened, or if it has at
2424   // least one user in the loop that is not widened.
2425   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2426   if (!NeedsScalarIV) {
2427     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2428                                     State);
2429     return;
2430   }
2431 
2432   // Try to create a new independent vector induction variable. If we can't
2433   // create the phi node, we will splat the scalar induction variable in each
2434   // loop iteration.
2435   if (!shouldScalarizeInstruction(EntryVal)) {
2436     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2437                                     State);
2438     Value *ScalarIV = CreateScalarIV(Step);
2439     // Create scalar steps that can be used by instructions we will later
2440     // scalarize. Note that the addition of the scalar steps will not increase
2441     // the number of instructions in the loop in the common case prior to
2442     // InstCombine. We will be trading one vector extract for each scalar step.
2443     buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2444     return;
2445   }
2446 
  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV. The exception is when we tail-fold: then the splat IV
  // feeds the predicate used by the masked loads/stores.
2450   Value *ScalarIV = CreateScalarIV(Step);
2451   if (!Cost->isScalarEpilogueAllowed())
2452     CreateSplatIV(ScalarIV, Step);
2453   buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2454 }
2455 
2456 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2457                                           Instruction::BinaryOps BinOp) {
2458   // Create and check the types.
2459   auto *ValVTy = cast<FixedVectorType>(Val->getType());
2460   int VLen = ValVTy->getNumElements();
2461 
2462   Type *STy = Val->getType()->getScalarType();
2463   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2464          "Induction Step must be an integer or FP");
2465   assert(Step->getType() == STy && "Step has wrong type");
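  // Illustrative example (integer case): with VLen = 4 and StartIdx = 8, the
  // result is Val + <8, 9, 10, 11> * splat(Step).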
2466 
2467   SmallVector<Constant *, 8> Indices;
2468 
2469   if (STy->isIntegerTy()) {
2470     // Create a vector of consecutive numbers from zero to VF.
2471     for (int i = 0; i < VLen; ++i)
2472       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
2473 
2474     // Add the consecutive indices to the vector value.
2475     Constant *Cv = ConstantVector::get(Indices);
2476     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
2477     Step = Builder.CreateVectorSplat(VLen, Step);
2478     assert(Step->getType() == Val->getType() && "Invalid step vec");
2479     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2480     // which can be found from the original scalar operations.
2481     Step = Builder.CreateMul(Cv, Step);
2482     return Builder.CreateAdd(Val, Step, "induction");
2483   }
2484 
2485   // Floating point induction.
2486   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2487          "Binary Opcode should be specified for FP induction");
2488   // Create a vector of consecutive numbers from zero to VF.
2489   for (int i = 0; i < VLen; ++i)
2490     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
2491 
2492   // Add the consecutive indices to the vector value.
2493   // Floating-point operations inherit FMF via the builder's flags.
2494   Constant *Cv = ConstantVector::get(Indices);
2495   Step = Builder.CreateVectorSplat(VLen, Step);
2496   Value *MulOp = Builder.CreateFMul(Cv, Step);
2497   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2498 }
2499 
2500 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2501                                            Instruction *EntryVal,
2502                                            const InductionDescriptor &ID,
2503                                            VPValue *Def, VPValue *CastDef,
2504                                            VPTransformState &State) {
2505   // We shouldn't have to build scalar steps if we aren't vectorizing.
2506   assert(VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2508   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2509   assert(ScalarIVTy == Step->getType() &&
2510          "Val and Step should have the same type");
2511 
2512   // We build scalar steps for both integer and floating-point induction
2513   // variables. Here, we determine the kind of arithmetic we will perform.
2514   Instruction::BinaryOps AddOp;
2515   Instruction::BinaryOps MulOp;
2516   if (ScalarIVTy->isIntegerTy()) {
2517     AddOp = Instruction::Add;
2518     MulOp = Instruction::Mul;
2519   } else {
2520     AddOp = ID.getInductionOpcode();
2521     MulOp = Instruction::FMul;
2522   }
2523 
2524   // Determine the number of scalars we need to generate for each unroll
2525   // iteration. If EntryVal is uniform, we only need to generate the first
2526   // lane. Otherwise, we generate all VF values.
2527   unsigned Lanes =
2528       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
2529           ? 1
2530           : VF.getKnownMinValue();
2531   assert((!VF.isScalable() || Lanes == 1) &&
2532          "Should never scalarize a scalable vector");
2533   // Compute the scalar steps and save the results in State.
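  // Illustrative example (fixed VF = 4, UF = 2, integer IV): the scalar value
  // produced for (Part, Lane) is ScalarIV + (Part * 4 + Lane) * Step.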
2534   for (unsigned Part = 0; Part < UF; ++Part) {
2535     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2536       auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2537                                          ScalarIVTy->getScalarSizeInBits());
2538       Value *StartIdx =
2539           createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2540       if (ScalarIVTy->isFloatingPointTy())
2541         StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
2542       StartIdx = Builder.CreateBinOp(
2543           AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2544       // The step returned by `createStepForVF` is a runtime-evaluated value
2545       // when VF is scalable. Otherwise, it should be folded into a Constant.
2546       assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2547              "Expected StartIdx to be folded to a constant when VF is not "
2548              "scalable");
2549       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2550       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2551       State.set(Def, Add, VPIteration(Part, Lane));
2552       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2553                                             Part, Lane);
2554     }
2555   }
2556 }
2557 
2558 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2559                                                     const VPIteration &Instance,
2560                                                     VPTransformState &State) {
2561   Value *ScalarInst = State.get(Def, Instance);
2562   Value *VectorValue = State.get(Def, Instance.Part);
2563   VectorValue = Builder.CreateInsertElement(
2564       VectorValue, ScalarInst,
2565       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2566   State.set(Def, VectorValue, Instance.Part);
2567 }
2568 
2569 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2570   assert(Vec->getType()->isVectorTy() && "Invalid type");
2571   return Builder.CreateVectorReverse(Vec, "reverse");
2572 }
2573 
2574 // Return whether we allow using masked interleave-groups (for dealing with
2575 // strided loads/stores that reside in predicated blocks, or for dealing
2576 // with gaps).
2577 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2578   // If an override option has been passed in for interleaved accesses, use it.
2579   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2580     return EnableMaskedInterleavedMemAccesses;
2581 
2582   return TTI.enableMaskedInterleavedAccessVectorization();
2583 }
2584 
2585 // Try to vectorize the interleave group that \p Instr belongs to.
2586 //
2587 // E.g. Translate following interleaved load group (factor = 3):
2588 //   for (i = 0; i < N; i+=3) {
2589 //     R = Pic[i];             // Member of index 0
2590 //     G = Pic[i+1];           // Member of index 1
2591 //     B = Pic[i+2];           // Member of index 2
2592 //     ... // do something to R, G, B
2593 //   }
2594 // To:
2595 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2596 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2597 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2598 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2599 //
2600 // Or translate following interleaved store group (factor = 3):
2601 //   for (i = 0; i < N; i+=3) {
2602 //     ... do something to R, G, B
2603 //     Pic[i]   = R;           // Member of index 0
2604 //     Pic[i+1] = G;           // Member of index 1
2605 //     Pic[i+2] = B;           // Member of index 2
2606 //   }
2607 // To:
2608 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2609 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2610 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2611 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2612 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2613 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2614     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2615     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2616     VPValue *BlockInMask) {
2617   Instruction *Instr = Group->getInsertPos();
2618   const DataLayout &DL = Instr->getModule()->getDataLayout();
2619 
2620   // Prepare for the vector type of the interleaved load/store.
2621   Type *ScalarTy = getMemInstValueType(Instr);
2622   unsigned InterleaveFactor = Group->getFactor();
2623   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2624   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2625 
2626   // Prepare for the new pointers.
2627   SmallVector<Value *, 2> AddrParts;
2628   unsigned Index = Group->getIndex(Instr);
2629 
2630   // TODO: extend the masked interleaved-group support to reversed access.
2631   assert((!BlockInMask || !Group->isReverse()) &&
2632          "Reversed masked interleave-group not supported.");
2633 
2634   // If the group is reverse, adjust the index to refer to the last vector lane
2635   // instead of the first. We adjust the index from the first vector lane,
2636   // rather than directly getting the pointer for lane VF - 1, because the
2637   // pointer operand of the interleaved access is supposed to be uniform. For
2638   // uniform instructions, we're only required to generate a value for the
2639   // first vector lane in each unroll iteration.
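  // Illustrative example: for a reversed group with Factor = 3 and a fixed
  // VF of 4, Index grows by (4 - 1) * 3 = 9, so the negated offset applied
  // below reaches the member-0 element of the tuple accessed by the last
  // vector lane, i.e. the lowest address touched by this part.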
2640   assert(!VF.isScalable() &&
2641          "scalable vector reverse operation is not implemented");
2642   if (Group->isReverse())
2643     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2644 
2645   for (unsigned Part = 0; Part < UF; Part++) {
2646     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2647     setDebugLocFromInst(Builder, AddrPart);
2648 
    // Notice that the current instruction could be at any index. We need to
    // adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2660 
2661     bool InBounds = false;
2662     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2663       InBounds = gep->isInBounds();
2664     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2665     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2666 
2667     // Cast to the vector pointer type.
2668     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2669     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2670     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2671   }
2672 
2673   setDebugLocFromInst(Builder, Instr);
2674   Value *PoisonVec = PoisonValue::get(VecTy);
2675 
2676   Value *MaskForGaps = nullptr;
2677   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2678     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2679     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2680     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2681   }
2682 
2683   // Vectorize the interleaved load group.
2684   if (isa<LoadInst>(Instr)) {
2685     // For each unroll part, create a wide load for the group.
2686     SmallVector<Value *, 2> NewLoads;
2687     for (unsigned Part = 0; Part < UF; Part++) {
2688       Instruction *NewLoad;
2689       if (BlockInMask || MaskForGaps) {
2690         assert(useMaskedInterleavedAccesses(*TTI) &&
2691                "masked interleaved groups are not allowed.");
2692         Value *GroupMask = MaskForGaps;
2693         if (BlockInMask) {
2694           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2695           assert(!VF.isScalable() && "scalable vectors not yet supported.");
2696           Value *ShuffledMask = Builder.CreateShuffleVector(
2697               BlockInMaskPart,
2698               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2699               "interleaved.mask");
2700           GroupMask = MaskForGaps
2701                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2702                                                 MaskForGaps)
2703                           : ShuffledMask;
2704         }
2705         NewLoad =
2706             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2707                                      GroupMask, PoisonVec, "wide.masked.vec");
2708       }
2709       else
2710         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2711                                             Group->getAlign(), "wide.vec");
2712       Group->addMetadata(NewLoad);
2713       NewLoads.push_back(NewLoad);
2714     }
2715 
2716     // For each member in the group, shuffle out the appropriate data from the
2717     // wide loads.
2718     unsigned J = 0;
2719     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2720       Instruction *Member = Group->getMember(I);
2721 
2722       // Skip the gaps in the group.
2723       if (!Member)
2724         continue;
2725 
2726       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2727       auto StrideMask =
2728           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2729       for (unsigned Part = 0; Part < UF; Part++) {
2730         Value *StridedVec = Builder.CreateShuffleVector(
2731             NewLoads[Part], StrideMask, "strided.vec");
2732 
        // If this member has a different type, cast the result to it.
2734         if (Member->getType() != ScalarTy) {
2735           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2736           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2737           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2738         }
2739 
2740         if (Group->isReverse())
2741           StridedVec = reverseVector(StridedVec);
2742 
2743         State.set(VPDefs[J], StridedVec, Part);
2744       }
2745       ++J;
2746     }
2747     return;
2748   }
2749 
  // The sub-vector type for the current instruction.
2751   assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2752   auto *SubVT = VectorType::get(ScalarTy, VF);
2753 
2754   // Vectorize the interleaved store group.
2755   for (unsigned Part = 0; Part < UF; Part++) {
2756     // Collect the stored vector from each member.
2757     SmallVector<Value *, 4> StoredVecs;
2758     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // Interleaved store groups don't allow gaps, so each index must have a
      // member.
      assert(Group->getMember(i) &&
             "Fail to get a member from an interleaved store group");
2761 
2762       Value *StoredVec = State.get(StoredValues[i], Part);
2763 
2764       if (Group->isReverse())
2765         StoredVec = reverseVector(StoredVec);
2766 
      // If this member has a different type, cast it to the unified type.
      if (StoredVec->getType() != SubVT)
2770         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2771 
2772       StoredVecs.push_back(StoredVec);
2773     }
2774 
2775     // Concatenate all vectors into a wide vector.
2776     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2777 
2778     // Interleave the elements in the wide vector.
2779     assert(!VF.isScalable() && "scalable vectors not yet supported.");
2780     Value *IVec = Builder.CreateShuffleVector(
2781         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2782         "interleaved.vec");
2783 
2784     Instruction *NewStoreInstr;
2785     if (BlockInMask) {
2786       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2787       Value *ShuffledMask = Builder.CreateShuffleVector(
2788           BlockInMaskPart,
2789           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2790           "interleaved.mask");
2791       NewStoreInstr = Builder.CreateMaskedStore(
2792           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
2793     }
2794     else
2795       NewStoreInstr =
2796           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2797 
2798     Group->addMetadata(NewStoreInstr);
2799   }
2800 }
2801 
2802 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2803     Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2804     VPValue *StoredValue, VPValue *BlockInMask) {
2805   // Attempt to issue a wide load.
2806   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2807   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2808 
2809   assert((LI || SI) && "Invalid Load/Store instruction");
2810   assert((!SI || StoredValue) && "No stored value provided for widened store");
2811   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2812 
2813   LoopVectorizationCostModel::InstWidening Decision =
2814       Cost->getWideningDecision(Instr, VF);
2815   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2816           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2817           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2818          "CM decision is not to widen the memory instruction");
2819 
2820   Type *ScalarDataTy = getMemInstValueType(Instr);
2821 
2822   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2823   const Align Alignment = getLoadStoreAlignment(Instr);
2824 
2825   // Determine if the pointer operand of the access is either consecutive or
2826   // reverse consecutive.
2827   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2828   bool ConsecutiveStride =
2829       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2830   bool CreateGatherScatter =
2831       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2832 
2833   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2834   // gather/scatter. Otherwise Decision should have been to Scalarize.
2835   assert((ConsecutiveStride || CreateGatherScatter) &&
2836          "The instruction should be scalarized");
2837   (void)ConsecutiveStride;
2838 
2839   VectorParts BlockInMaskParts(UF);
2840   bool isMaskRequired = BlockInMask;
2841   if (isMaskRequired)
2842     for (unsigned Part = 0; Part < UF; ++Part)
2843       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2844 
2845   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2846     // Calculate the pointer for the specific unroll-part.
2847     GetElementPtrInst *PartPtr = nullptr;
2848 
2849     bool InBounds = false;
2850     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2851       InBounds = gep->isInBounds();
2852     if (Reverse) {
2853       // If the address is consecutive but reversed, then the
2854       // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width vectors, VScale is 1 and RunTimeVF is simply
      // VF.getKnownMinValue().
2857       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2858       // NumElt = -Part * RunTimeVF
2859       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
2860       // LastLane = 1 - RunTimeVF
2861       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
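      // Illustrative example (fixed VF = 4, Part = 1): NumElt = -4 and
      // LastLane = -3, so the two GEPs below land at Ptr - 7 elements, the
      // lowest address accessed by this reversed part.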
2862       PartPtr =
2863           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
2864       PartPtr->setIsInBounds(InBounds);
2865       PartPtr = cast<GetElementPtrInst>(
2866           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
2867       PartPtr->setIsInBounds(InBounds);
2868       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2869         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2870     } else {
2871       Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2872       PartPtr = cast<GetElementPtrInst>(
2873           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2874       PartPtr->setIsInBounds(InBounds);
2875     }
2876 
2877     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2878     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2879   };
2880 
2881   // Handle Stores:
2882   if (SI) {
2883     setDebugLocFromInst(Builder, SI);
2884 
2885     for (unsigned Part = 0; Part < UF; ++Part) {
2886       Instruction *NewSI = nullptr;
2887       Value *StoredVal = State.get(StoredValue, Part);
2888       if (CreateGatherScatter) {
2889         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2890         Value *VectorGep = State.get(Addr, Part);
2891         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2892                                             MaskPart);
2893       } else {
2894         if (Reverse) {
2895           // If we store to reverse consecutive memory locations, then we need
2896           // to reverse the order of elements in the stored value.
2897           StoredVal = reverseVector(StoredVal);
2898           // We don't want to update the value in the map as it might be used in
2899           // another expression. So don't call resetVectorValue(StoredVal).
2900         }
2901         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2902         if (isMaskRequired)
2903           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2904                                             BlockInMaskParts[Part]);
2905         else
2906           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2907       }
2908       addMetadata(NewSI, SI);
2909     }
2910     return;
2911   }
2912 
2913   // Handle loads.
2914   assert(LI && "Must have a load instruction");
2915   setDebugLocFromInst(Builder, LI);
2916   for (unsigned Part = 0; Part < UF; ++Part) {
2917     Value *NewLI;
2918     if (CreateGatherScatter) {
2919       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2920       Value *VectorGep = State.get(Addr, Part);
2921       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2922                                          nullptr, "wide.masked.gather");
2923       addMetadata(NewLI, LI);
2924     } else {
2925       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2926       if (isMaskRequired)
2927         NewLI = Builder.CreateMaskedLoad(
2928             VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
2929             "wide.masked.load");
2930       else
2931         NewLI =
2932             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2933 
2934       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2935       addMetadata(NewLI, LI);
2936       if (Reverse)
2937         NewLI = reverseVector(NewLI);
2938     }
2939 
2940     State.set(Def, NewLI, Part);
2941   }
2942 }
2943 
2944 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
2945                                                VPUser &User,
2946                                                const VPIteration &Instance,
2947                                                bool IfPredicateInstr,
2948                                                VPTransformState &State) {
2949   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2950 
2951   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2952   // the first lane and part.
2953   if (isa<NoAliasScopeDeclInst>(Instr))
2954     if (!Instance.isFirstIteration())
2955       return;
2956 
2957   setDebugLocFromInst(Builder, Instr);
2958 
  // Does this instruction return a value?
2960   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2961 
2962   Instruction *Cloned = Instr->clone();
2963   if (!IsVoidRetTy)
2964     Cloned->setName(Instr->getName() + ".cloned");
2965 
2966   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
2967                                Builder.GetInsertPoint());
2968   // Replace the operands of the cloned instructions with their scalar
2969   // equivalents in the new loop.
2970   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
2971     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
2972     auto InputInstance = Instance;
2973     if (!Operand || !OrigLoop->contains(Operand) ||
2974         (Cost->isUniformAfterVectorization(Operand, State.VF)))
2975       InputInstance.Lane = VPLane::getFirstLane();
2976     auto *NewOp = State.get(User.getOperand(op), InputInstance);
2977     Cloned->setOperand(op, NewOp);
2978   }
2979   addNewMetadata(Cloned, Instr);
2980 
2981   // Place the cloned scalar in the new loop.
2982   Builder.Insert(Cloned);
2983 
2984   State.set(Def, Cloned, Instance);
2985 
  // If we just cloned a new assumption, add it to the assumption cache.
2987   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2988     if (II->getIntrinsicID() == Intrinsic::assume)
2989       AC->registerAssumption(II);
2990 
2991   // End if-block.
2992   if (IfPredicateInstr)
2993     PredicatedInstructions.push_back(Cloned);
2994 }
2995 
2996 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2997                                                       Value *End, Value *Step,
2998                                                       Instruction *DL) {
2999   BasicBlock *Header = L->getHeader();
3000   BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header, as this will be a single-block loop.
3003   if (!Latch)
3004     Latch = Header;
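  // The induction skeleton produced below is roughly:
  //   header:
  //     %index = phi [ %Start, %preheader ], [ %index.next, %latch ]
  //     ...
  //   latch:
  //     %index.next = add %index, %Step
  //     %cmp = icmp eq %index.next, %End
  //     br i1 %cmp, label %exit, label %header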
3005 
3006   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
3007   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3008   setDebugLocFromInst(Builder, OldInst);
3009   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
3010 
3011   Builder.SetInsertPoint(Latch->getTerminator());
3012   setDebugLocFromInst(Builder, OldInst);
3013 
3014   // Create i+1 and fill the PHINode.
3015   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
3016   Induction->addIncoming(Start, L->getLoopPreheader());
3017   Induction->addIncoming(Next, Latch);
3018   // Create the compare.
3019   Value *ICmp = Builder.CreateICmpEQ(Next, End);
3020   Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3021 
3022   // Now we have two terminators. Remove the old one from the block.
3023   Latch->getTerminator()->eraseFromParent();
3024 
3025   return Induction;
3026 }
3027 
3028 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3029   if (TripCount)
3030     return TripCount;
3031 
3032   assert(L && "Create Trip Count for null loop.");
3033   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3034   // Find the loop boundaries.
3035   ScalarEvolution *SE = PSE.getSE();
3036   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3037   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3038          "Invalid loop count");
3039 
3040   Type *IdxTy = Legal->getWidestInductionType();
3041   assert(IdxTy && "No type for induction");
3042 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign extended before the
  // compare. The only way we can get a backedge-taken count in that case is
  // if the induction variable was signed and as such will not overflow, so
  // truncation is legal.
3048   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3049       IdxTy->getPrimitiveSizeInBits())
3050     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3051   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3052 
3053   // Get the total trip count from the count by adding 1.
3054   const SCEV *ExitCount = SE->getAddExpr(
3055       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
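  // E.g., a loop 'for (i = 0; i < n; ++i)' (with n > 0) has a backedge-taken
  // count of n - 1 and a trip count of n.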
3056 
3057   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3058 
3059   // Expand the trip count and place the new instructions in the preheader.
3060   // Notice that the pre-header does not change, only the loop body.
3061   SCEVExpander Exp(*SE, DL, "induction");
3062 
3063   // Count holds the overall loop count (N).
3064   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3065                                 L->getLoopPreheader()->getTerminator());
3066 
3067   if (TripCount->getType()->isPointerTy())
3068     TripCount =
3069         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3070                                     L->getLoopPreheader()->getTerminator());
3071 
3072   return TripCount;
3073 }
3074 
3075 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3076   if (VectorTripCount)
3077     return VectorTripCount;
3078 
3079   Value *TC = getOrCreateTripCount(L);
3080   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3081 
3082   Type *Ty = TC->getType();
3083   // This is where we can make the step a runtime constant.
3084   Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
3085 
3086   // If the tail is to be folded by masking, round the number of iterations N
3087   // up to a multiple of Step instead of rounding down. This is done by first
3088   // adding Step-1 and then rounding down. Note that it's ok if this addition
3089   // overflows: the vector induction variable will eventually wrap to zero given
3090   // that it starts at zero and its Step is a power of two; the loop will then
3091   // exit, with the last early-exit vector comparison also producing all-true.
3092   if (Cost->foldTailByMasking()) {
3093     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3094            "VF*UF must be a power of 2 when folding tail by masking");
3095     assert(!VF.isScalable() &&
3096            "Tail folding not yet supported for scalable vectors");
3097     TC = Builder.CreateAdd(
3098         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3099   }
3100 
3101   // Now we need to generate the expression for the part of the loop that the
3102   // vectorized body will execute. This is equal to N - (N % Step) if scalar
3103   // iterations are not required for correctness, or N - Step, otherwise. Step
3104   // is equal to the vectorization factor (number of SIMD elements) times the
3105   // unroll factor (number of SIMD instructions).
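  // Illustrative example: with a trip count of 21, VF = 4 and UF = 2
  // (Step = 8), R below is 21 % 8 = 5 and the vector trip count becomes 16;
  // the remaining 5 iterations run in the scalar loop. When folding the tail,
  // TC was already rounded up above (21 -> 28), giving a vector trip count of
  // 24 that covers all original iterations under the mask.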
3106   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3107 
3108   // There are two cases where we need to ensure (at least) the last iteration
3109   // runs in the scalar remainder loop. Thus, if the step evenly divides
3110   // the trip count, we set the remainder to be equal to the step. If the step
3111   // does not evenly divide the trip count, no adjustment is necessary since
3112   // there will already be scalar iterations. Note that the minimum iterations
3113   // check ensures that N >= Step. The cases are:
3114   // 1) If there is a non-reversed interleaved group that may speculatively
3115   //    access memory out-of-bounds.
3116   // 2) If any instruction may follow a conditionally taken exit. That is, if
3117   //    the loop contains multiple exiting blocks, or a single exiting block
3118   //    which is not the latch.
3119   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
3120     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3121     R = Builder.CreateSelect(IsZero, Step, R);
3122   }
3123 
3124   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3125 
3126   return VectorTripCount;
3127 }
3128 
3129 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3130                                                    const DataLayout &DL) {
3131   // Verify that V is a vector type with same number of elements as DstVTy.
3132   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3133   unsigned VF = DstFVTy->getNumElements();
3134   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
3136   Type *SrcElemTy = SrcVecTy->getElementType();
3137   Type *DstElemTy = DstFVTy->getElementType();
3138   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3139          "Vector elements must have same size");
3140 
3141   // Do a direct cast if element types are castable.
3142   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3143     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3144   }
  // V cannot be directly cast to the desired vector type. This may happen
  // when V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
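  // E.g., on a target with 64-bit pointers, a <2 x double> value destined for
  // a <2 x i8*> type is first bitcast to <2 x i64> and then cast to the
  // pointer vector.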
3149   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3150          "Only one type should be a pointer type");
3151   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3152          "Only one type should be a floating point type");
3153   Type *IntTy =
3154       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3155   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3156   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3157   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3158 }
3159 
3160 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3161                                                          BasicBlock *Bypass) {
3162   Value *Count = getOrCreateTripCount(L);
3163   // Reuse existing vector loop preheader for TC checks.
3164   // Note that new preheader block is generated for vector loop.
3165   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3166   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3167 
3168   // Generate code to check if the loop's trip count is less than VF * UF, or
3169   // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding
  // one to the backedge-taken count overflowed, leading to an incorrect trip
  // count of zero. In this case we will also jump to the scalar loop.
3173   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3174                                           : ICmpInst::ICMP_ULT;
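  // E.g., with VF = 4 and UF = 2, the bypass to the scalar loop is taken when
  // the trip count is below 8 (or equal to 8 if a scalar epilogue is
  // required).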
3175 
3176   // If tail is to be folded, vector loop takes care of all iterations.
3177   Value *CheckMinIters = Builder.getFalse();
3178   if (!Cost->foldTailByMasking()) {
3179     Value *Step =
3180         createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3181     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3182   }
3183   // Create new preheader for vector loop.
3184   LoopVectorPreHeader =
3185       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3186                  "vector.ph");
3187 
3188   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3189                                DT->getNode(Bypass)->getIDom()) &&
3190          "TC check is expected to dominate Bypass");
3191 
3192   // Update dominator for Bypass & LoopExit.
3193   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3194   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3195 
3196   ReplaceInstWithInst(
3197       TCCheckBlock->getTerminator(),
3198       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3199   LoopBypassBlocks.push_back(TCCheckBlock);
3200 }
3201 
3202 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3203 
3204   BasicBlock *const SCEVCheckBlock =
3205       RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3206   if (!SCEVCheckBlock)
3207     return nullptr;
3208 
3209   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3210            (OptForSizeBasedOnProfile &&
3211             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3212          "Cannot SCEV check stride or overflow when optimizing for size");
3213 
  // Update the dominator only if this is the first RT check.
3216   if (LoopBypassBlocks.empty()) {
3217     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3218     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3219   }
3220 
3221   LoopBypassBlocks.push_back(SCEVCheckBlock);
3222   AddedSafetyChecks = true;
3223   return SCEVCheckBlock;
3224 }
3225 
3226 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3227                                                       BasicBlock *Bypass) {
3228   // VPlan-native path does not do any analysis for runtime checks currently.
3229   if (EnableVPlanNativePath)
3230     return nullptr;
3231 
3232   BasicBlock *const MemCheckBlock =
3233       RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3234 
  // Check if we generated code that checks at runtime whether arrays overlap.
  // We put the checks into a separate block to make the more common case of
  // few elements faster.
3238   if (!MemCheckBlock)
3239     return nullptr;
3240 
3241   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3242     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3243            "Cannot emit memory checks when optimizing for size, unless forced "
3244            "to vectorize.");
3245     ORE->emit([&]() {
3246       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3247                                         L->getStartLoc(), L->getHeader())
3248              << "Code-size may be reduced by not forcing "
3249                 "vectorization, or by source-code modifications "
3250                 "eliminating the need for runtime checks "
3251                 "(e.g., adding 'restrict').";
3252     });
3253   }
3254 
3255   LoopBypassBlocks.push_back(MemCheckBlock);
3256 
3257   AddedSafetyChecks = true;
3258 
3259   // We currently don't use LoopVersioning for the actual loop cloning but we
3260   // still use it to add the noalias metadata.
3261   LVer = std::make_unique<LoopVersioning>(
3262       *Legal->getLAI(),
3263       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3264       DT, PSE.getSE());
3265   LVer->prepareNoAliasMetadata();
3266   return MemCheckBlock;
3267 }
3268 
3269 Value *InnerLoopVectorizer::emitTransformedIndex(
3270     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3271     const InductionDescriptor &ID) const {
3272 
3273   SCEVExpander Exp(*SE, DL, "induction");
3274   auto Step = ID.getStep();
3275   auto StartValue = ID.getStartValue();
3276   assert(Index->getType() == Step->getType() &&
3277          "Index type does not match StepValue type");
3278 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
3285   auto CreateAdd = [&B](Value *X, Value *Y) {
3286     assert(X->getType() == Y->getType() && "Types don't match!");
3287     if (auto *CX = dyn_cast<ConstantInt>(X))
3288       if (CX->isZero())
3289         return Y;
3290     if (auto *CY = dyn_cast<ConstantInt>(Y))
3291       if (CY->isZero())
3292         return X;
3293     return B.CreateAdd(X, Y);
3294   };
3295 
3296   auto CreateMul = [&B](Value *X, Value *Y) {
3297     assert(X->getType() == Y->getType() && "Types don't match!");
3298     if (auto *CX = dyn_cast<ConstantInt>(X))
3299       if (CX->isOne())
3300         return Y;
3301     if (auto *CY = dyn_cast<ConstantInt>(Y))
3302       if (CY->isOne())
3303         return X;
3304     return B.CreateMul(X, Y);
3305   };
3306 
3307   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3308   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3309   // the DomTree is not kept up-to-date for additional blocks generated in the
3310   // vector loop. By using the header as insertion point, we guarantee that the
3311   // expanded instructions dominate all their uses.
3312   auto GetInsertPoint = [this, &B]() {
3313     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3314     if (InsertBB != LoopVectorBody &&
3315         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3316       return LoopVectorBody->getTerminator();
3317     return &*B.GetInsertPoint();
3318   };
3319 
3320   switch (ID.getKind()) {
3321   case InductionDescriptor::IK_IntInduction: {
3322     assert(Index->getType() == StartValue->getType() &&
3323            "Index type does not match StartValue type");
3324     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3325       return B.CreateSub(StartValue, Index);
3326     auto *Offset = CreateMul(
3327         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3328     return CreateAdd(StartValue, Offset);
3329   }
3330   case InductionDescriptor::IK_PtrInduction: {
3331     assert(isa<SCEVConstant>(Step) &&
3332            "Expected constant step for pointer induction");
3333     return B.CreateGEP(
3334         StartValue->getType()->getPointerElementType(), StartValue,
3335         CreateMul(Index,
3336                   Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3337   }
3338   case InductionDescriptor::IK_FpInduction: {
3339     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3340     auto InductionBinOp = ID.getInductionBinOp();
3341     assert(InductionBinOp &&
3342            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3343             InductionBinOp->getOpcode() == Instruction::FSub) &&
3344            "Original bin op should be defined for FP induction");
3345 
3346     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3347     Value *MulExp = B.CreateFMul(StepValue, Index);
3348     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3349                          "induction");
3350   }
3351   case InductionDescriptor::IK_NoInduction:
3352     return nullptr;
3353   }
3354   llvm_unreachable("invalid enum");
3355 }
3356 
3357 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3358   LoopScalarBody = OrigLoop->getHeader();
3359   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3360   LoopExitBlock = OrigLoop->getUniqueExitBlock();
3361   assert(LoopExitBlock && "Must have an exit block");
3362   assert(LoopVectorPreHeader && "Invalid loop structure");
3363 
3364   LoopMiddleBlock =
3365       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3366                  LI, nullptr, Twine(Prefix) + "middle.block");
3367   LoopScalarPreHeader =
3368       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3369                  nullptr, Twine(Prefix) + "scalar.ph");
3370 
3371   // Set up branch from middle block to the exit and scalar preheader blocks.
3372   // completeLoopSkeleton will update the condition to use an iteration check,
3373   // if required to decide whether to execute the remainder.
3374   BranchInst *BrInst =
3375       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
3376   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3377   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3378   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3379 
  // We intentionally don't let SplitBlock update LoopInfo since LoopVectorBody
  // should belong to a different loop than LoopVectorPreHeader. LoopVectorBody
  // is explicitly added to the correct place a few lines later.
3383   LoopVectorBody =
3384       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3385                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3386 
3387   // Update dominator for loop exit.
3388   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3389 
3390   // Create and register the new vector loop.
3391   Loop *Lp = LI->AllocateLoop();
3392   Loop *ParentLoop = OrigLoop->getParentLoop();
3393 
3394   // Insert the new loop into the loop nest and register the new basic blocks
3395   // before calling any utilities such as SCEV that require valid LoopInfo.
3396   if (ParentLoop) {
3397     ParentLoop->addChildLoop(Lp);
3398   } else {
3399     LI->addTopLevelLoop(Lp);
3400   }
3401   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3402   return Lp;
3403 }
3404 
3405 void InnerLoopVectorizer::createInductionResumeValues(
3406     Loop *L, Value *VectorTripCount,
3407     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3408   assert(VectorTripCount && L && "Expected valid arguments");
3409   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3410           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3411          "Inconsistent information about additional bypass.");
3412   // We are going to resume the execution of the scalar loop.
3413   // Go over all of the induction variables that we found and fix the
3414   // PHIs that are left in the scalar version of the loop.
3415   // The starting values of PHI nodes depend on the counter of the last
3416   // iteration in the vectorized loop.
3417   // If we come from a bypass edge then we need to start from the original
3418   // start value.
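  // Illustrative example: an induction with start value S and step C resumes
  // at S + VectorTripCount * C when arriving from the middle block, and at S
  // when arriving over a bypass edge.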
3419   for (auto &InductionEntry : Legal->getInductionVars()) {
3420     PHINode *OrigPhi = InductionEntry.first;
3421     InductionDescriptor II = InductionEntry.second;
3422 
    // Create phi nodes to merge from the backedge-taken check block.
3424     PHINode *BCResumeVal =
3425         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3426                         LoopScalarPreHeader->getTerminator());
3427     // Copy original phi DL over to the new one.
3428     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3429     Value *&EndValue = IVEndValues[OrigPhi];
3430     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3431     if (OrigPhi == OldInduction) {
3432       // We know what the end value is.
3433       EndValue = VectorTripCount;
3434     } else {
3435       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3436 
3437       // Fast-math-flags propagate from the original induction instruction.
3438       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3439         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3440 
3441       Type *StepType = II.getStep()->getType();
3442       Instruction::CastOps CastOp =
3443           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3444       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3445       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3446       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3447       EndValue->setName("ind.end");
3448 
3449       // Compute the end value for the additional bypass (if applicable).
3450       if (AdditionalBypass.first) {
3451         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3452         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3453                                          StepType, true);
3454         CRD =
3455             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3456         EndValueFromAdditionalBypass =
3457             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3458         EndValueFromAdditionalBypass->setName("ind.end");
3459       }
3460     }
3461     // The new PHI merges the original incoming value, in case of a bypass,
3462     // or the value at the end of the vectorized loop.
3463     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3464 
3465     // Fix the scalar body counter (PHI node).
3466     // The old induction's phi node in the scalar body needs the truncated
3467     // value.
3468     for (BasicBlock *BB : LoopBypassBlocks)
3469       BCResumeVal->addIncoming(II.getStartValue(), BB);
3470 
3471     if (AdditionalBypass.first)
3472       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3473                                             EndValueFromAdditionalBypass);
3474 
3475     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3476   }
3477 }
3478 
3479 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3480                                                       MDNode *OrigLoopID) {
3481   assert(L && "Expected valid loop.");
3482 
3483   // The trip counts should be cached by now.
3484   Value *Count = getOrCreateTripCount(L);
3485   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3486 
3487   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3488 
3489   // Add a check in the middle block to see if we have completed
3490   // all of the iterations in the first vector loop.
3491   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3492   // If tail is to be folded, we know we don't need to run the remainder.
3493   if (!Cost->foldTailByMasking()) {
3494     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3495                                         Count, VectorTripCount, "cmp.n",
3496                                         LoopMiddleBlock->getTerminator());
3497 
3498     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3499     // of the corresponding compare because they may have ended up with
3500     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3502     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3503     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3504   }
3505 
3506   // Get ready to start creating new instructions into the vectorized body.
3507   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3508          "Inconsistent vector loop preheader");
3509   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3510 
3511   Optional<MDNode *> VectorizedLoopID =
3512       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3513                                       LLVMLoopVectorizeFollowupVectorized});
3514   if (VectorizedLoopID.hasValue()) {
3515     L->setLoopID(VectorizedLoopID.getValue());
3516 
3517     // Do not setAlreadyVectorized if loop attributes have been defined
3518     // explicitly.
3519     return LoopVectorPreHeader;
3520   }
3521 
3522   // Keep all loop hints from the original loop on the vector loop (we'll
3523   // replace the vectorizer-specific hints below).
3524   if (MDNode *LID = OrigLoop->getLoopID())
3525     L->setLoopID(LID);
3526 
3527   LoopVectorizeHints Hints(L, true, *ORE);
3528   Hints.setAlreadyVectorized();
3529 
3530 #ifdef EXPENSIVE_CHECKS
3531   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3532   LI->verify(*DT);
3533 #endif
3534 
3535   return LoopVectorPreHeader;
3536 }
3537 
3538 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3539   /*
3540    In this function we generate a new loop. The new loop will contain
3541    the vectorized instructions while the old loop will continue to run the
3542    scalar remainder.
3543 
3544        [ ] <-- loop iteration number check.
3545     /   |
3546    /    v
3547   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3548   |  /  |
3549   | /   v
3550   ||   [ ]     <-- vector pre header.
3551   |/    |
3552   |     v
3553   |    [  ] \
3554   |    [  ]_|   <-- vector loop.
3555   |     |
3556   |     v
3557   |   -[ ]   <--- middle-block.
3558   |  /  |
3559   | /   v
3560   -|- >[ ]     <--- new preheader.
3561    |    |
3562    |    v
3563    |   [ ] \
3564    |   [ ]_|   <-- old scalar loop to handle remainder.
3565     \   |
3566      \  v
3567       >[ ]     <-- exit block.
3568    ...
3569    */
3570 
3571   // Get the metadata of the original loop before it gets modified.
3572   MDNode *OrigLoopID = OrigLoop->getLoopID();
3573 
3574   // Create an empty vector loop, and prepare basic blocks for the runtime
3575   // checks.
3576   Loop *Lp = createVectorLoopSkeleton("");
3577 
3578   // Now, compare the new count to zero. If it is zero skip the vector loop and
3579   // jump to the scalar loop. This check also covers the case where the
3580   // backedge-taken count is uint##_max: adding one to it will overflow leading
3581   // to an incorrect trip count of zero. In this (rare) case we will also jump
3582   // to the scalar loop.
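  // For illustration: with a hypothetical i8 count, a backedge-taken count of
  // 255 gives a trip count of 255 + 1 == 0 after wrapping, so the zero check
  // also sends that case to the scalar loop.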
3583   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3584 
3585   // Generate the code to check any assumptions that we've made for SCEV
3586   // expressions.
3587   emitSCEVChecks(Lp, LoopScalarPreHeader);
3588 
3589   // Generate the code that checks in runtime if arrays overlap. We put the
3590   // checks into a separate block to make the more common case of few elements
3591   // faster.
3592   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3593 
  // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
3601   //   - is an integer
3602   //   - counts from zero, stepping by one
3603   //   - is the size of the widest induction variable type
3604   // then we create a new one.
3605   OldInduction = Legal->getPrimaryInduction();
3606   Type *IdxTy = Legal->getWidestInductionType();
3607   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3608   // The loop step is equal to the vectorization factor (num of SIMD elements)
3609   // times the unroll factor (num of SIMD instructions).
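  // For illustration, VF = 4 and UF = 2 give a step of 8, so the canonical
  // induction below advances by 8 elements per vector-loop iteration (scaled
  // by vscale for scalable VFs).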
3610   Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3611   Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3612   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3613   Induction =
3614       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3615                               getDebugLocFromInstOrOperands(OldInduction));
3616 
3617   // Emit phis for the new starting index of the scalar loop.
3618   createInductionResumeValues(Lp, CountRoundDown);
3619 
3620   return completeLoopSkeleton(Lp, OrigLoopID);
3621 }
3622 
3623 // Fix up external users of the induction variable. At this point, we are
3624 // in LCSSA form, with all external PHIs that use the IV having one input value,
3625 // coming from the remainder loop. We need those PHIs to also have a correct
3626 // value for the IV when arriving directly from the middle block.
3627 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3628                                        const InductionDescriptor &II,
3629                                        Value *CountRoundDown, Value *EndValue,
3630                                        BasicBlock *MiddleBlock) {
3631   // There are two kinds of external IV usages - those that use the value
3632   // computed in the last iteration (the PHI) and those that use the penultimate
3633   // value (the value that feeds into the phi from the loop latch).
3634   // We allow both, but they, obviously, have different values.
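  // For illustration, shorthand IR (names hypothetical):
  //
  //   loop:
  //     %iv = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
  //     %iv.next = add i64 %iv, 1
  //     ...
  //   exit:
  //     %a = phi i64 [ %iv.next, %loop ] ; needs EndValue (middle block)
  //     %b = phi i64 [ %iv, %loop ]      ; needs EndValue - Step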
3635 
3636   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3637 
3638   DenseMap<Value *, Value *> MissingVals;
3639 
3640   // An external user of the last iteration's value should see the value that
3641   // the remainder loop uses to initialize its own IV.
3642   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3643   for (User *U : PostInc->users()) {
3644     Instruction *UI = cast<Instruction>(U);
3645     if (!OrigLoop->contains(UI)) {
3646       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3647       MissingVals[UI] = EndValue;
3648     }
3649   }
3650 
  // An external user of the penultimate value needs to see EndValue - Step.
3652   // The simplest way to get this is to recompute it from the constituent SCEVs,
3653   // that is Start + (Step * (CRD - 1)).
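  // For illustration (hypothetical values): Start = 0, Step = 1 and CRD = 8
  // give 0 + 1 * (8 - 1) = 7, the value the phi held on the last iteration
  // executed by the vector loop.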
3654   for (User *U : OrigPhi->users()) {
3655     auto *UI = cast<Instruction>(U);
3656     if (!OrigLoop->contains(UI)) {
3657       const DataLayout &DL =
3658           OrigLoop->getHeader()->getModule()->getDataLayout();
3659       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3660 
3661       IRBuilder<> B(MiddleBlock->getTerminator());
3662 
3663       // Fast-math-flags propagate from the original induction instruction.
3664       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3665         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3666 
3667       Value *CountMinusOne = B.CreateSub(
3668           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3669       Value *CMO =
3670           !II.getStep()->getType()->isIntegerTy()
3671               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3672                              II.getStep()->getType())
3673               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3674       CMO->setName("cast.cmo");
3675       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3676       Escape->setName("ind.escape");
3677       MissingVals[UI] = Escape;
3678     }
3679   }
3680 
3681   for (auto &I : MissingVals) {
3682     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3684     // that is %IV2 = phi [...], [ %IV1, %latch ]
3685     // In this case, if IV1 has an external use, we need to avoid adding both
3686     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3687     // don't already have an incoming value for the middle block.
3688     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3689       PHI->addIncoming(I.second, MiddleBlock);
3690   }
3691 }
3692 
3693 namespace {
3694 
3695 struct CSEDenseMapInfo {
3696   static bool canHandle(const Instruction *I) {
3697     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3698            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3699   }
3700 
3701   static inline Instruction *getEmptyKey() {
3702     return DenseMapInfo<Instruction *>::getEmptyKey();
3703   }
3704 
3705   static inline Instruction *getTombstoneKey() {
3706     return DenseMapInfo<Instruction *>::getTombstoneKey();
3707   }
3708 
3709   static unsigned getHashValue(const Instruction *I) {
3710     assert(canHandle(I) && "Unknown instruction!");
3711     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3712                                                            I->value_op_end()));
3713   }
3714 
3715   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3716     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3717         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3718       return LHS == RHS;
3719     return LHS->isIdenticalTo(RHS);
3720   }
3721 };
3722 
3723 } // end anonymous namespace
3724 
/// Perform CSE of induction variable instructions.
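/// For illustration (hypothetical IR): two identical
///   %a = extractelement <4 x i32> %v, i32 0
///   %b = extractelement <4 x i32> %v, i32 0
/// instructions hash alike and satisfy isIdenticalTo(), so uses of %b are
/// rewritten to use %a and %b is erased.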
3726 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3728   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3729   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3730     Instruction *In = &*I++;
3731 
3732     if (!CSEDenseMapInfo::canHandle(In))
3733       continue;
3734 
3735     // Check if we can replace this instruction with any of the
3736     // visited instructions.
3737     if (Instruction *V = CSEMap.lookup(In)) {
3738       In->replaceAllUsesWith(V);
3739       In->eraseFromParent();
3740       continue;
3741     }
3742 
3743     CSEMap[In] = In;
3744   }
3745 }
3746 
3747 InstructionCost
3748 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3749                                               bool &NeedToScalarize) {
3750   Function *F = CI->getCalledFunction();
3751   Type *ScalarRetTy = CI->getType();
3752   SmallVector<Type *, 4> Tys, ScalarTys;
3753   for (auto &ArgOp : CI->arg_operands())
3754     ScalarTys.push_back(ArgOp->getType());
3755 
3756   // Estimate cost of scalarized vector call. The source operands are assumed
3757   // to be vectors, so we need to extract individual elements from there,
3758   // execute VF scalar calls, and then gather the result into the vector return
3759   // value.
3760   InstructionCost ScalarCallCost =
3761       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3762   if (VF.isScalar())
3763     return ScalarCallCost;
3764 
3765   // Compute corresponding vector type for return value and arguments.
3766   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3767   for (Type *ScalarTy : ScalarTys)
3768     Tys.push_back(ToVectorTy(ScalarTy, VF));
3769 
3770   // Compute costs of unpacking argument values for the scalar calls and
3771   // packing the return values to a vector.
3772   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3773 
3774   InstructionCost Cost =
3775       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
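  // For illustration (hypothetical numbers): with VF = 4, a scalar call cost
  // of 10 and a scalarization overhead of 6, the estimate above is
  // 10 * 4 + 6 = 46; the vector variant below is preferred only if its cost
  // is lower.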
3776 
3777   // If we can't emit a vector call for this function, then the currently found
3778   // cost is the cost we need to return.
3779   NeedToScalarize = true;
3780   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3781   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3782 
3783   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3784     return Cost;
3785 
3786   // If the corresponding vector cost is cheaper, return its cost.
3787   InstructionCost VectorCallCost =
3788       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3789   if (VectorCallCost < Cost) {
3790     NeedToScalarize = false;
3791     Cost = VectorCallCost;
3792   }
3793   return Cost;
3794 }
3795 
3796 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3797   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3798     return Elt;
3799   return VectorType::get(Elt, VF);
3800 }
3801 
3802 InstructionCost
3803 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3804                                                    ElementCount VF) {
3805   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3806   assert(ID && "Expected intrinsic call!");
3807   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3808   FastMathFlags FMF;
3809   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3810     FMF = FPMO->getFastMathFlags();
3811 
3812   SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
3813   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3814   SmallVector<Type *> ParamTys;
3815   std::transform(FTy->param_begin(), FTy->param_end(),
3816                  std::back_inserter(ParamTys),
3817                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3818 
3819   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3820                                     dyn_cast<IntrinsicInst>(CI));
3821   return TTI.getIntrinsicInstrCost(CostAttrs,
3822                                    TargetTransformInfo::TCK_RecipThroughput);
3823 }
3824 
3825 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3826   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3827   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3828   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3829 }
3830 
3831 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3832   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3833   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3834   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3835 }
3836 
3837 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3838   // For every instruction `I` in MinBWs, truncate the operands, create a
3839   // truncated version of `I` and reextend its result. InstCombine runs
3840   // later and will remove any ext/trunc pairs.
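  // For illustration (hypothetical IR), an i32 add known to need only 8 bits
  // becomes:
  //   %t0 = trunc <4 x i32> %a to <4 x i8>
  //   %t1 = trunc <4 x i32> %b to <4 x i8>
  //   %s  = add <4 x i8> %t0, %t1
  //   %r  = zext <4 x i8> %s to <4 x i32>
  // with InstCombine expected to clean up redundant ext/trunc pairs later.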
3841   SmallPtrSet<Value *, 4> Erased;
3842   for (const auto &KV : Cost->getMinimalBitwidths()) {
3843     // If the value wasn't vectorized, we must maintain the original scalar
3844     // type. The absence of the value from State indicates that it
3845     // wasn't vectorized.
3846     VPValue *Def = State.Plan->getVPValue(KV.first);
3847     if (!State.hasAnyVectorValue(Def))
3848       continue;
3849     for (unsigned Part = 0; Part < UF; ++Part) {
3850       Value *I = State.get(Def, Part);
3851       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3852         continue;
3853       Type *OriginalTy = I->getType();
3854       Type *ScalarTruncatedTy =
3855           IntegerType::get(OriginalTy->getContext(), KV.second);
3856       auto *TruncatedTy = FixedVectorType::get(
3857           ScalarTruncatedTy,
3858           cast<FixedVectorType>(OriginalTy)->getNumElements());
3859       if (TruncatedTy == OriginalTy)
3860         continue;
3861 
3862       IRBuilder<> B(cast<Instruction>(I));
3863       auto ShrinkOperand = [&](Value *V) -> Value * {
3864         if (auto *ZI = dyn_cast<ZExtInst>(V))
3865           if (ZI->getSrcTy() == TruncatedTy)
3866             return ZI->getOperand(0);
3867         return B.CreateZExtOrTrunc(V, TruncatedTy);
3868       };
3869 
3870       // The actual instruction modification depends on the instruction type,
3871       // unfortunately.
3872       Value *NewI = nullptr;
3873       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3874         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3875                              ShrinkOperand(BO->getOperand(1)));
3876 
3877         // Any wrapping introduced by shrinking this operation shouldn't be
3878         // considered undefined behavior. So, we can't unconditionally copy
3879         // arithmetic wrapping flags to NewI.
3880         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3881       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3882         NewI =
3883             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3884                          ShrinkOperand(CI->getOperand(1)));
3885       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3886         NewI = B.CreateSelect(SI->getCondition(),
3887                               ShrinkOperand(SI->getTrueValue()),
3888                               ShrinkOperand(SI->getFalseValue()));
3889       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3890         switch (CI->getOpcode()) {
3891         default:
3892           llvm_unreachable("Unhandled cast!");
3893         case Instruction::Trunc:
3894           NewI = ShrinkOperand(CI->getOperand(0));
3895           break;
3896         case Instruction::SExt:
3897           NewI = B.CreateSExtOrTrunc(
3898               CI->getOperand(0),
3899               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3900           break;
3901         case Instruction::ZExt:
3902           NewI = B.CreateZExtOrTrunc(
3903               CI->getOperand(0),
3904               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3905           break;
3906         }
3907       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3908         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3909                              ->getNumElements();
3910         auto *O0 = B.CreateZExtOrTrunc(
3911             SI->getOperand(0),
3912             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3913         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3914                              ->getNumElements();
3915         auto *O1 = B.CreateZExtOrTrunc(
3916             SI->getOperand(1),
3917             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3918 
3919         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3920       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3921         // Don't do anything with the operands, just extend the result.
3922         continue;
3923       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3924         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
3925                             ->getNumElements();
3926         auto *O0 = B.CreateZExtOrTrunc(
3927             IE->getOperand(0),
3928             FixedVectorType::get(ScalarTruncatedTy, Elements));
3929         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3930         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3931       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3932         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
3933                             ->getNumElements();
3934         auto *O0 = B.CreateZExtOrTrunc(
3935             EE->getOperand(0),
3936             FixedVectorType::get(ScalarTruncatedTy, Elements));
3937         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3938       } else {
3939         // If we don't know what to do, be conservative and don't do anything.
3940         continue;
3941       }
3942 
3943       // Lastly, extend the result.
3944       NewI->takeName(cast<Instruction>(I));
3945       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3946       I->replaceAllUsesWith(Res);
3947       cast<Instruction>(I)->eraseFromParent();
3948       Erased.insert(I);
3949       State.reset(Def, Res, Part);
3950     }
3951   }
3952 
3953   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3954   for (const auto &KV : Cost->getMinimalBitwidths()) {
3955     // If the value wasn't vectorized, we must maintain the original scalar
3956     // type. The absence of the value from State indicates that it
3957     // wasn't vectorized.
3958     VPValue *Def = State.Plan->getVPValue(KV.first);
3959     if (!State.hasAnyVectorValue(Def))
3960       continue;
3961     for (unsigned Part = 0; Part < UF; ++Part) {
3962       Value *I = State.get(Def, Part);
3963       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3964       if (Inst && Inst->use_empty()) {
3965         Value *NewI = Inst->getOperand(0);
3966         Inst->eraseFromParent();
3967         State.reset(Def, NewI, Part);
3968       }
3969     }
3970   }
3971 }
3972 
3973 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
3974   // Insert truncates and extends for any truncated instructions as hints to
3975   // InstCombine.
3976   if (VF.isVector())
3977     truncateToMinimalBitwidths(State);
3978 
3979   // Fix widened non-induction PHIs by setting up the PHI operands.
3980   if (OrigPHIsToFix.size()) {
3981     assert(EnableVPlanNativePath &&
3982            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3983     fixNonInductionPHIs(State);
3984   }
3985 
3986   // At this point every instruction in the original loop is widened to a
3987   // vector form. Now we need to fix the recurrences in the loop. These PHI
3988   // nodes are currently empty because we did not want to introduce cycles.
3989   // This is the second stage of vectorizing recurrences.
3990   fixCrossIterationPHIs(State);
3991 
3992   // Forget the original basic block.
3993   PSE.getSE()->forgetLoop(OrigLoop);
3994 
3995   // Fix-up external users of the induction variables.
3996   for (auto &Entry : Legal->getInductionVars())
3997     fixupIVUsers(Entry.first, Entry.second,
3998                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3999                  IVEndValues[Entry.first], LoopMiddleBlock);
4000 
4001   fixLCSSAPHIs(State);
4002   for (Instruction *PI : PredicatedInstructions)
4003     sinkScalarOperands(&*PI);
4004 
4005   // Remove redundant induction instructions.
4006   cse(LoopVectorBody);
4007 
4008   // Set/update profile weights for the vector and remainder loops as original
4009   // loop iterations are now distributed among them. Note that original loop
4010   // represented by LoopScalarBody becomes remainder loop after vectorization.
4011   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less precise result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
4017   //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
4021   setProfileInfoAfterUnrolling(
4022       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4023       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4024 }
4025 
4026 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4027   // In order to support recurrences we need to be able to vectorize Phi nodes.
4028   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4029   // stage #2: We now need to fix the recurrences by adding incoming edges to
4030   // the currently empty PHI nodes. At this point every instruction in the
4031   // original loop is widened to a vector form so we can use them to construct
4032   // the incoming edges.
4033   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
4034     // Handle first-order recurrences and reductions that need to be fixed.
4035     if (Legal->isFirstOrderRecurrence(&Phi))
4036       fixFirstOrderRecurrence(&Phi, State);
4037     else if (Legal->isReductionVariable(&Phi))
4038       fixReduction(&Phi, State);
4039   }
4040 }
4041 
4042 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
4043                                                   VPTransformState &State) {
4044   // This is the second phase of vectorizing first-order recurrences. An
4045   // overview of the transformation is described below. Suppose we have the
4046   // following loop.
4047   //
4048   //   for (int i = 0; i < n; ++i)
4049   //     b[i] = a[i] - a[i - 1];
4050   //
4051   // There is a first-order recurrence on "a". For this loop, the shorthand
4052   // scalar IR looks like:
4053   //
4054   //   scalar.ph:
4055   //     s_init = a[-1]
4056   //     br scalar.body
4057   //
4058   //   scalar.body:
4059   //     i = phi [0, scalar.ph], [i+1, scalar.body]
4060   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4061   //     s2 = a[i]
4062   //     b[i] = s2 - s1
4063   //     br cond, scalar.body, ...
4064   //
  // In this example, s1 is a recurrence because its value depends on the
4066   // previous iteration. In the first phase of vectorization, we created a
4067   // temporary value for s1. We now complete the vectorization and produce the
4068   // shorthand vector IR shown below (for VF = 4, UF = 1).
4069   //
4070   //   vector.ph:
4071   //     v_init = vector(..., ..., ..., a[-1])
4072   //     br vector.body
4073   //
4074   //   vector.body
4075   //     i = phi [0, vector.ph], [i+4, vector.body]
4076   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
4077   //     v2 = a[i, i+1, i+2, i+3];
4078   //     v3 = vector(v1(3), v2(0, 1, 2))
4079   //     b[i, i+1, i+2, i+3] = v2 - v3
4080   //     br cond, vector.body, middle.block
4081   //
4082   //   middle.block:
4083   //     x = v2(3)
4084   //     br scalar.ph
4085   //
4086   //   scalar.ph:
4087   //     s_init = phi [x, middle.block], [a[-1], otherwise]
4088   //     br scalar.body
4089   //
  // After the vector loop completes execution, we extract the next value of
4091   // the recurrence (x) to use as the initial value in the scalar loop.
4092 
4093   // Get the original loop preheader and single loop latch.
4094   auto *Preheader = OrigLoop->getLoopPreheader();
4095   auto *Latch = OrigLoop->getLoopLatch();
4096 
4097   // Get the initial and previous values of the scalar recurrence.
4098   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4099   auto *Previous = Phi->getIncomingValueForBlock(Latch);
4100 
4101   // Create a vector from the initial value.
4102   auto *VectorInit = ScalarInit;
4103   if (VF.isVector()) {
4104     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4105     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
4106     VectorInit = Builder.CreateInsertElement(
4107         PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
4108         Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
4109   }
4110 
4111   VPValue *PhiDef = State.Plan->getVPValue(Phi);
4112   VPValue *PreviousDef = State.Plan->getVPValue(Previous);
4113   // We constructed a temporary phi node in the first phase of vectorization.
4114   // This phi node will eventually be deleted.
4115   Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0)));
4116 
4117   // Create a phi node for the new recurrence. The current value will either be
4118   // the initial value inserted into a vector or loop-varying vector value.
4119   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4120   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4121 
4122   // Get the vectorized previous value of the last part UF - 1. It appears last
4123   // among all unrolled iterations, due to the order of their construction.
4124   Value *PreviousLastPart = State.get(PreviousDef, UF - 1);
4125 
4126   // Find and set the insertion point after the previous value if it is an
4127   // instruction.
4128   BasicBlock::iterator InsertPt;
4129   // Note that the previous value may have been constant-folded so it is not
4130   // guaranteed to be an instruction in the vector loop.
4131   // FIXME: Loop invariant values do not form recurrences. We should deal with
4132   //        them earlier.
4133   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4134     InsertPt = LoopVectorBody->getFirstInsertionPt();
4135   else {
4136     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4137     if (isa<PHINode>(PreviousLastPart))
4138       // If the previous value is a phi node, we should insert after all the phi
4139       // nodes in the block containing the PHI to avoid breaking basic block
4140       // verification. Note that the basic block may be different to
4141       // LoopVectorBody, in case we predicate the loop.
4142       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4143     else
4144       InsertPt = ++PreviousInst->getIterator();
4145   }
4146   Builder.SetInsertPoint(&*InsertPt);
4147 
4148   // We will construct a vector for the recurrence by combining the values for
4149   // the current and previous iterations. This is the required shuffle mask.
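  // For illustration, with VF = 4 the mask is <3, 4, 5, 6>: lane 0 takes the
  // last element of the first operand and lanes 1-3 take the first three
  // elements of the second, matching v3 = vector(v1(3), v2(0, 1, 2)) in the
  // example above.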
4150   assert(!VF.isScalable());
4151   SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
4152   ShuffleMask[0] = VF.getKnownMinValue() - 1;
4153   for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
4154     ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
4155 
4156   // The vector from which to take the initial value for the current iteration
4157   // (actual or unrolled). Initially, this is the vector phi node.
4158   Value *Incoming = VecPhi;
4159 
4160   // Shuffle the current and previous vector and update the vector parts.
4161   for (unsigned Part = 0; Part < UF; ++Part) {
4162     Value *PreviousPart = State.get(PreviousDef, Part);
4163     Value *PhiPart = State.get(PhiDef, Part);
4164     auto *Shuffle =
4165         VF.isVector()
4166             ? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
4167             : Incoming;
4168     PhiPart->replaceAllUsesWith(Shuffle);
4169     cast<Instruction>(PhiPart)->eraseFromParent();
4170     State.reset(PhiDef, Shuffle, Part);
4171     Incoming = PreviousPart;
4172   }
4173 
4174   // Fix the latch value of the new recurrence in the vector loop.
4175   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4176 
4177   // Extract the last vector element in the middle block. This will be the
4178   // initial value for the recurrence when jumping to the scalar loop.
4179   auto *ExtractForScalar = Incoming;
4180   if (VF.isVector()) {
4181     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4182     ExtractForScalar = Builder.CreateExtractElement(
4183         ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
4184         "vector.recur.extract");
4185   }
4186   // Extract the second last element in the middle block if the
4187   // Phi is used outside the loop. We need to extract the phi itself
4188   // and not the last element (the phi update in the current iteration). This
4189   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4190   // when the scalar loop is not run at all.
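  // For illustration with VF = 4: lane 3 of the final vector holds the last
  // update of the recurrence, so lane 2 holds the value the phi itself
  // carried in that iteration, which is what such an outside user must see
  // when the scalar loop is skipped.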
4191   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4192   if (VF.isVector())
4193     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4194         Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
4195         "vector.recur.extract.for.phi");
  // When the loop is unrolled without vectorizing, initialize
  // ExtractForPhiUsedOutsideLoop with the unroll part just prior to
  // `Incoming`. This is analogous to the vectorized case above: extracting
  // the second last element when VF > 1.
4200   else if (UF > 1)
4201     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4202 
4203   // Fix the initial value of the original recurrence in the scalar loop.
4204   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4205   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4206   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4207     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4208     Start->addIncoming(Incoming, BB);
4209   }
4210 
4211   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4212   Phi->setName("scalar.recur");
4213 
4214   // Finally, fix users of the recurrence outside the loop. The users will need
4215   // either the last value of the scalar recurrence or the last value of the
4216   // vector recurrence we extracted in the middle block. Since the loop is in
4217   // LCSSA form, we just need to find all the phi nodes for the original scalar
4218   // recurrence in the exit block, and then add an edge for the middle block.
4219   // Note that LCSSA does not imply single entry when the original scalar loop
4220   // had multiple exiting edges (as we always run the last iteration in the
4221   // scalar epilogue); in that case, the exiting path through middle will be
4222   // dynamically dead and the value picked for the phi doesn't matter.
4223   for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4224     if (any_of(LCSSAPhi.incoming_values(),
4225                [Phi](Value *V) { return V == Phi; }))
4226       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4227 }
4228 
4229 void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
  // Get its reduction variable descriptor.
4231   assert(Legal->isReductionVariable(Phi) &&
4232          "Unable to find the reduction variable");
4233   RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4234 
4235   RecurKind RK = RdxDesc.getRecurrenceKind();
4236   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4237   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4238   setDebugLocFromInst(Builder, ReductionStartValue);
4239   bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
4240 
4241   VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4242   // This is the vector-clone of the value that leaves the loop.
4243   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4244 
4245   // Wrap flags are in general invalid after vectorization, clear them.
4246   clearReductionWrapFlags(RdxDesc, State);
4247 
4248   // Fix the vector-loop phi.
4249 
  // Reductions do not have to start at zero. They can start with
  // any loop-invariant value.
4252   BasicBlock *Latch = OrigLoop->getLoopLatch();
4253   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
4254 
4255   for (unsigned Part = 0; Part < UF; ++Part) {
4256     Value *VecRdxPhi = State.get(State.Plan->getVPValue(Phi), Part);
4257     Value *Val = State.get(State.Plan->getVPValue(LoopVal), Part);
4258     cast<PHINode>(VecRdxPhi)
4259       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4260   }
4261 
4262   // Before each round, move the insertion point right between
4263   // the PHIs and the values we are going to write.
4264   // This allows us to write both PHINodes and the extractelement
4265   // instructions.
4266   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4267 
4268   setDebugLocFromInst(Builder, LoopExitInst);
4269 
4270   // If tail is folded by masking, the vector value to leave the loop should be
4271   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4272   // instead of the former. For an inloop reduction the reduction will already
4273   // be predicated, and does not need to be handled here.
4274   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4275     for (unsigned Part = 0; Part < UF; ++Part) {
4276       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4277       Value *Sel = nullptr;
4278       for (User *U : VecLoopExitInst->users()) {
4279         if (isa<SelectInst>(U)) {
4280           assert(!Sel && "Reduction exit feeding two selects");
4281           Sel = U;
4282         } else
4283           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4284       }
4285       assert(Sel && "Reduction exit feeds no select");
4286       State.reset(LoopExitInstDef, Sel, Part);
4287 
4288       // If the target can create a predicated operator for the reduction at no
4289       // extra cost in the loop (for example a predicated vadd), it can be
4290       // cheaper for the select to remain in the loop than be sunk out of it,
4291       // and so use the select value for the phi instead of the old
4292       // LoopExitValue.
4293       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
4294       if (PreferPredicatedReductionSelect ||
4295           TTI->preferPredicatedReductionSelect(
4296               RdxDesc.getOpcode(), Phi->getType(),
4297               TargetTransformInfo::ReductionFlags())) {
4298         auto *VecRdxPhi =
4299             cast<PHINode>(State.get(State.Plan->getVPValue(Phi), Part));
4300         VecRdxPhi->setIncomingValueForBlock(
4301             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4302       }
4303     }
4304   }
4305 
4306   // If the vector reduction can be performed in a smaller type, we truncate
4307   // then extend the loop exit value to enable InstCombine to evaluate the
4308   // entire expression in the smaller type.
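  // For illustration (hypothetical IR): for an i32 add reduction known to
  // need only 8 bits, the exit value is truncated to <4 x i8> and re-extended
  // to <4 x i32> inside the loop, and the final reduction in the middle block
  // then operates on the truncated <4 x i8> value.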
4309   if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
4310     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4311     assert(!VF.isScalable() && "scalable vectors not yet supported.");
4312     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4313     Builder.SetInsertPoint(
4314         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4315     VectorParts RdxParts(UF);
4316     for (unsigned Part = 0; Part < UF; ++Part) {
4317       RdxParts[Part] = State.get(LoopExitInstDef, Part);
4318       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4319       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4320                                         : Builder.CreateZExt(Trunc, VecTy);
4321       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4322            UI != RdxParts[Part]->user_end();)
4323         if (*UI != Trunc) {
4324           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4325           RdxParts[Part] = Extnd;
4326         } else {
4327           ++UI;
4328         }
4329     }
4330     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4331     for (unsigned Part = 0; Part < UF; ++Part) {
4332       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4333       State.reset(LoopExitInstDef, RdxParts[Part], Part);
4334     }
4335   }
4336 
4337   // Reduce all of the unrolled parts into a single vector.
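  // For illustration, with UF = 2 and an add reduction this emits a single
  // "bin.rdx" add combining part 1 into part 0; the reduction across vector
  // lanes, when VF > 1, follows below via createTargetReduction.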
4338   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4339   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4340 
4341   // The middle block terminator has already been assigned a DebugLoc here (the
4342   // OrigLoop's single latch terminator). We want the whole middle block to
4343   // appear to execute on this line because: (a) it is all compiler generated,
4344   // (b) these instructions are always executed after evaluating the latch
4345   // conditional branch, and (c) other passes may add new predecessors which
4346   // terminate on this line. This is the easiest way to ensure we don't
4347   // accidentally cause an extra step back into the loop while debugging.
4348   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4349   {
4350     // Floating-point operations should have some FMF to enable the reduction.
4351     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4352     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4353     for (unsigned Part = 1; Part < UF; ++Part) {
4354       Value *RdxPart = State.get(LoopExitInstDef, Part);
4355       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4356         ReducedPartRdx = Builder.CreateBinOp(
4357             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4358       } else {
4359         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4360       }
4361     }
4362   }
4363 
4364   // Create the reduction after the loop. Note that inloop reductions create the
4365   // target reduction in the loop using a Reduction recipe.
4366   if (VF.isVector() && !IsInLoopReductionPhi) {
4367     ReducedPartRdx =
4368         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4369     // If the reduction can be performed in a smaller type, we need to extend
4370     // the reduction to the wider type before we branch to the original loop.
4371     if (Phi->getType() != RdxDesc.getRecurrenceType())
4372       ReducedPartRdx =
4373         RdxDesc.isSigned()
4374         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
4375         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
4376   }
4377 
4378   // Create a phi node that merges control-flow from the backedge-taken check
4379   // block and the middle block.
4380   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
4381                                         LoopScalarPreHeader->getTerminator());
4382   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4383     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4384   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4385 
4386   // Now, we need to fix the users of the reduction variable
4387   // inside and outside of the scalar remainder loop.
4388 
4389   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4390   // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4392   for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4393     if (any_of(LCSSAPhi.incoming_values(),
4394                [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4395       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4396 
4397   // Fix the scalar loop reduction variable with the incoming reduction sum
4398   // from the vector body and from the backedge value.
4399   int IncomingEdgeBlockIdx =
4400     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4401   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4402   // Pick the other block.
4403   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4404   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4405   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4406 }
4407 
4408 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
4409                                                   VPTransformState &State) {
4410   RecurKind RK = RdxDesc.getRecurrenceKind();
4411   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4412     return;
4413 
4414   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4415   assert(LoopExitInstr && "null loop exit instruction");
4416   SmallVector<Instruction *, 8> Worklist;
4417   SmallPtrSet<Instruction *, 8> Visited;
4418   Worklist.push_back(LoopExitInstr);
4419   Visited.insert(LoopExitInstr);
4420 
4421   while (!Worklist.empty()) {
4422     Instruction *Cur = Worklist.pop_back_val();
4423     if (isa<OverflowingBinaryOperator>(Cur))
4424       for (unsigned Part = 0; Part < UF; ++Part) {
4425         Value *V = State.get(State.Plan->getVPValue(Cur), Part);
4426         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4427       }
4428 
4429     for (User *U : Cur->users()) {
4430       Instruction *UI = cast<Instruction>(U);
4431       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4432           Visited.insert(UI).second)
4433         Worklist.push_back(UI);
4434     }
4435   }
4436 }
4437 
4438 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4439   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4440     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
      // Some phis were already hand updated by the reduction and recurrence
      // code above; leave them alone.
4443       continue;
4444 
4445     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4446     // Non-instruction incoming values will have only one value.
4447 
4448     VPLane Lane = VPLane::getFirstLane();
4449     if (isa<Instruction>(IncomingValue) &&
4450         !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4451                                            VF))
4452       Lane = VPLane::getLastLaneForVF(VF);
4453 
4454     // Can be a loop invariant incoming value or the last scalar value to be
4455     // extracted from the vectorized loop.
4456     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4457     Value *lastIncomingValue =
4458         OrigLoop->isLoopInvariant(IncomingValue)
4459             ? IncomingValue
4460             : State.get(State.Plan->getVPValue(IncomingValue),
4461                         VPIteration(UF - 1, Lane));
4462     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4463   }
4464 }
4465 
4466 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4467   // The basic block and loop containing the predicated instruction.
4468   auto *PredBB = PredInst->getParent();
4469   auto *VectorLoop = LI->getLoopFor(PredBB);
4470 
4471   // Initialize a worklist with the operands of the predicated instruction.
4472   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4473 
4474   // Holds instructions that we need to analyze again. An instruction may be
4475   // reanalyzed if we don't yet know if we can sink it or not.
4476   SmallVector<Instruction *, 8> InstsToReanalyze;
4477 
4478   // Returns true if a given use occurs in the predicated block. Phi nodes use
4479   // their operands in their corresponding predecessor blocks.
4480   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4481     auto *I = cast<Instruction>(U.getUser());
4482     BasicBlock *BB = I->getParent();
4483     if (auto *Phi = dyn_cast<PHINode>(I))
4484       BB = Phi->getIncomingBlock(
4485           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4486     return BB == PredBB;
4487   };
4488 
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a full
  // pass through the worklist doesn't sink a single instruction.
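  // For illustration: if %a is used only by %b and %b only by the predicated
  // instruction, %b is sunk first; once %b lives in PredBB, %a's only use is
  // in PredBB too, so %a is sunk on a later visit.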
4493   bool Changed;
4494   do {
4495     // Add the instructions that need to be reanalyzed to the worklist, and
4496     // reset the changed indicator.
4497     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4498     InstsToReanalyze.clear();
4499     Changed = false;
4500 
4501     while (!Worklist.empty()) {
4502       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4503 
4504       // We can't sink an instruction if it is a phi node, is already in the
4505       // predicated block, is not in the loop, or may have side effects.
4506       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
4507           !VectorLoop->contains(I) || I->mayHaveSideEffects())
4508         continue;
4509 
4510       // It's legal to sink the instruction if all its uses occur in the
4511       // predicated block. Otherwise, there's nothing to do yet, and we may
4512       // need to reanalyze the instruction.
4513       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4514         InstsToReanalyze.push_back(I);
4515         continue;
4516       }
4517 
4518       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4520       I->moveBefore(&*PredBB->getFirstInsertionPt());
4521       Worklist.insert(I->op_begin(), I->op_end());
4522 
4523       // The sinking may have enabled other instructions to be sunk, so we will
4524       // need to iterate.
4525       Changed = true;
4526     }
4527   } while (Changed);
4528 }
4529 
4530 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4531   for (PHINode *OrigPhi : OrigPHIsToFix) {
4532     VPWidenPHIRecipe *VPPhi =
4533         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4534     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4535     // Make sure the builder has a valid insert point.
4536     Builder.SetInsertPoint(NewPhi);
4537     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4538       VPValue *Inc = VPPhi->getIncomingValue(i);
4539       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4540       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4541     }
4542   }
4543 }
4544 
4545 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4546                                    VPUser &Operands, unsigned UF,
4547                                    ElementCount VF, bool IsPtrLoopInvariant,
4548                                    SmallBitVector &IsIndexLoopInvariant,
4549                                    VPTransformState &State) {
4550   // Construct a vector GEP by widening the operands of the scalar GEP as
4551   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4552   // results in a vector of pointers when at least one operand of the GEP
4553   // is vector-typed. Thus, to keep the representation compact, we only use
4554   // vector-typed operands for loop-varying values.
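  // For illustration (hypothetical IR): widening a GEP with a loop-invariant
  // base %base and a loop-varying index %i at VF = 4 yields
  //   %vgep = getelementptr i32, i32* %base, <4 x i64> %vec.ind
  // i.e. a <4 x i32*> result from a scalar base and one vector-typed index.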
4555 
4556   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4557     // If we are vectorizing, but the GEP has only loop-invariant operands,
4558     // the GEP we build (by only using vector-typed operands for
4559     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4560     // produce a vector of pointers, we need to either arbitrarily pick an
4561     // operand to broadcast, or broadcast a clone of the original GEP.
4562     // Here, we broadcast a clone of the original.
4563     //
4564     // TODO: If at some point we decide to scalarize instructions having
4565     //       loop-invariant operands, this special case will no longer be
4566     //       required. We would add the scalarization decision to
4567     //       collectLoopScalars() and teach getVectorValue() to broadcast
4568     //       the lane-zero scalar value.
4569     auto *Clone = Builder.Insert(GEP->clone());
4570     for (unsigned Part = 0; Part < UF; ++Part) {
4571       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4572       State.set(VPDef, EntryPart, Part);
4573       addMetadata(EntryPart, GEP);
4574     }
4575   } else {
4576     // If the GEP has at least one loop-varying operand, we are sure to
4577     // produce a vector of pointers. But if we are only unrolling, we want
4578     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4579     // produce with the code below will be scalar (if VF == 1) or vector
4580     // (otherwise). Note that for the unroll-only case, we still maintain
4581     // values in the vector mapping with initVector, as we do for other
4582     // instructions.
4583     for (unsigned Part = 0; Part < UF; ++Part) {
4584       // The pointer operand of the new GEP. If it's loop-invariant, we
4585       // won't broadcast it.
4586       auto *Ptr = IsPtrLoopInvariant
4587                       ? State.get(Operands.getOperand(0), VPIteration(0, 0))
4588                       : State.get(Operands.getOperand(0), Part);
4589 
4590       // Collect all the indices for the new GEP. If any index is
4591       // loop-invariant, we won't broadcast it.
4592       SmallVector<Value *, 4> Indices;
4593       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4594         VPValue *Operand = Operands.getOperand(I);
4595         if (IsIndexLoopInvariant[I - 1])
4596           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
4597         else
4598           Indices.push_back(State.get(Operand, Part));
4599       }
4600 
4601       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4602       // but it should be a vector, otherwise.
4603       auto *NewGEP =
4604           GEP->isInBounds()
4605               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4606                                           Indices)
4607               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4608       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4609              "NewGEP is not a pointer vector");
4610       State.set(VPDef, NewGEP, Part);
4611       addMetadata(NewGEP, GEP);
4612     }
4613   }
4614 }
4615 
4616 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4617                                               RecurrenceDescriptor *RdxDesc,
4618                                               VPValue *StartVPV, VPValue *Def,
4619                                               VPTransformState &State) {
4620   PHINode *P = cast<PHINode>(PN);
4621   if (EnableVPlanNativePath) {
4622     // Currently we enter here in the VPlan-native path for non-induction
4623     // PHIs where all control flow is uniform. We simply widen these PHIs.
4624     // Create a vector phi with no operands - the vector phi operands will be
4625     // set at the end of vector code generation.
4626     Type *VecTy = (State.VF.isScalar())
4627                       ? PN->getType()
4628                       : VectorType::get(PN->getType(), State.VF);
4629     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4630     State.set(Def, VecPhi, 0);
4631     OrigPHIsToFix.push_back(P);
4632 
4633     return;
4634   }
4635 
4636   assert(PN->getParent() == OrigLoop->getHeader() &&
4637          "Non-header phis should have been handled elsewhere");
4638 
4639   Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
4640   // In order to support recurrences we need to be able to vectorize Phi nodes.
4641   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4642   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4643   // this value when we vectorize all of the instructions that use the PHI.
4644   if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4645     Value *Iden = nullptr;
4646     bool ScalarPHI =
4647         (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4648     Type *VecTy =
4649         ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4650 
4651     if (RdxDesc) {
4652       assert(Legal->isReductionVariable(P) && StartV &&
4653              "RdxDesc should only be set for reduction variables; in that case "
4654              "a StartV is also required");
4655       RecurKind RK = RdxDesc->getRecurrenceKind();
4656       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
        // MinMax reductions have the start value as their identity.
4658         if (ScalarPHI) {
4659           Iden = StartV;
4660         } else {
4661           IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4662           Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4663           StartV = Iden =
4664               Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4665         }
4666       } else {
4667         Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4668             RK, VecTy->getScalarType());
4669         Iden = IdenC;
4670 
4671         if (!ScalarPHI) {
4672           Iden = ConstantVector::getSplat(State.VF, IdenC);
4673           IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4674           Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4675           Constant *Zero = Builder.getInt32(0);
4676           StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
4677         }
4678       }
4679     }
4680 
4681     for (unsigned Part = 0; Part < State.UF; ++Part) {
4682       // This is phase one of vectorizing PHIs.
4683       Value *EntryPart = PHINode::Create(
4684           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4685       State.set(Def, EntryPart, Part);
4686       if (StartV) {
4687         // Make sure to add the reduction start value only to the
4688         // first unroll part.
4689         Value *StartVal = (Part == 0) ? StartV : Iden;
4690         cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4691       }
4692     }
4693     return;
4694   }
4695 
4696   assert(!Legal->isReductionVariable(P) &&
4697          "reductions should be handled above");
4698 
4699   setDebugLocFromInst(Builder, P);
4700 
4701   // This PHINode must be an induction variable.
4702   // Make sure that we know about it.
4703   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4704 
4705   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4706   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4707 
4708   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4709   // which can be found from the original scalar operations.
4710   switch (II.getKind()) {
4711   case InductionDescriptor::IK_NoInduction:
4712     llvm_unreachable("Unknown induction");
4713   case InductionDescriptor::IK_IntInduction:
4714   case InductionDescriptor::IK_FpInduction:
4715     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4716   case InductionDescriptor::IK_PtrInduction: {
4717     // Handle the pointer induction variable case.
4718     assert(P->getType()->isPointerTy() && "Unexpected type.");
4719 
4720     if (Cost->isScalarAfterVectorization(P, State.VF)) {
4721       // This is the normalized GEP that starts counting at zero.
4722       Value *PtrInd =
4723           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4724       // Determine the number of scalars we need to generate for each unroll
4725       // iteration. If the instruction is uniform, we only need to generate the
4726       // first lane. Otherwise, we generate all VF values.
4727       unsigned Lanes = Cost->isUniformAfterVectorization(P, State.VF)
4728                            ? 1
4729                            : State.VF.getKnownMinValue();
4730       for (unsigned Part = 0; Part < UF; ++Part) {
4731         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4732           Constant *Idx = ConstantInt::get(
4733               PtrInd->getType(), Lane + Part * State.VF.getKnownMinValue());
4734           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4735           Value *SclrGep =
4736               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4737           SclrGep->setName("next.gep");
4738           State.set(Def, SclrGep, VPIteration(Part, Lane));
4739         }
4740       }
4741       return;
4742     }
4743     assert(isa<SCEVConstant>(II.getStep()) &&
4744            "Induction step not a SCEV constant!");
4745     Type *PhiType = II.getStep()->getType();
4746 
4747     // Build a pointer phi.
4748     Value *ScalarStartValue = II.getStartValue();
4749     Type *ScStValueType = ScalarStartValue->getType();
4750     PHINode *NewPointerPhi =
4751         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4752     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4753 
4754     // The pointer induction is advanced by a GEP created in the loop latch.
4755     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4756     Instruction *InductionLoc = LoopLatch->getTerminator();
4757     const SCEV *ScalarStep = II.getStep();
4758     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4759     Value *ScalarStepValue =
4760         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4761     Value *InductionGEP = GetElementPtrInst::Create(
4762         ScStValueType->getPointerElementType(), NewPointerPhi,
4763         Builder.CreateMul(
4764             ScalarStepValue,
4765             ConstantInt::get(PhiType, State.VF.getKnownMinValue() * State.UF)),
4766         "ptr.ind", InductionLoc);
4767     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4768 
4769     // Create UF many actual address geps that use the pointer
4770     // phi as base and a vectorized version of the step value
4771     // (<step*0, ..., step*N>) as offset.
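    // For illustration only (a sketch assuming VF = 4, an i32 element type and
    // an i64 step %S), part 0 computes something of the form
    //   %off = mul <4 x i64> <i64 0, i64 1, i64 2, i64 3>, (splat of %S)
    //   %gep = getelementptr i32, i32* %pointer.phi, <4 x i64> %off
    // i.e. a vector of pointers based on the scalar pointer phi.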
4772     for (unsigned Part = 0; Part < State.UF; ++Part) {
4773       SmallVector<Constant *, 8> Indices;
4774       // Create a vector of consecutive indices for this unroll part.
4775       for (unsigned i = 0; i < State.VF.getKnownMinValue(); ++i)
4776         Indices.push_back(
4777             ConstantInt::get(PhiType, i + Part * State.VF.getKnownMinValue()));
4778       Constant *StartOffset = ConstantVector::get(Indices);
4779 
4780       Value *GEP = Builder.CreateGEP(
4781           ScStValueType->getPointerElementType(), NewPointerPhi,
4782           Builder.CreateMul(StartOffset,
4783                             Builder.CreateVectorSplat(
4784                                 State.VF.getKnownMinValue(), ScalarStepValue),
4785                             "vector.gep"));
4786       State.set(Def, GEP, Part);
4787     }
4788   }
4789   }
4790 }
4791 
4792 /// A helper function for checking whether an integer division-related
4793 /// instruction may divide by zero (in which case it must be predicated if
4794 /// executed conditionally in the scalar code).
4795 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4796 /// Non-zero divisors that are not compile-time constants will not be
4797 /// converted into multiplication, so we will still end up scalarizing
4798 /// the division, but can do so without predication.
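/// For example, in scalar code like 'if (c) r = a / b;', 'b' may be zero on
/// iterations where the division is not executed, so a scalarized division must
/// stay predicated unless 'b' is known to be a non-zero constant.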
4799 static bool mayDivideByZero(Instruction &I) {
4800   assert((I.getOpcode() == Instruction::UDiv ||
4801           I.getOpcode() == Instruction::SDiv ||
4802           I.getOpcode() == Instruction::URem ||
4803           I.getOpcode() == Instruction::SRem) &&
4804          "Unexpected instruction");
4805   Value *Divisor = I.getOperand(1);
4806   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4807   return !CInt || CInt->isZero();
4808 }
4809 
4810 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4811                                            VPUser &User,
4812                                            VPTransformState &State) {
4813   switch (I.getOpcode()) {
4814   case Instruction::Call:
4815   case Instruction::Br:
4816   case Instruction::PHI:
4817   case Instruction::GetElementPtr:
4818   case Instruction::Select:
4819     llvm_unreachable("This instruction is handled by a different recipe.");
4820   case Instruction::UDiv:
4821   case Instruction::SDiv:
4822   case Instruction::SRem:
4823   case Instruction::URem:
4824   case Instruction::Add:
4825   case Instruction::FAdd:
4826   case Instruction::Sub:
4827   case Instruction::FSub:
4828   case Instruction::FNeg:
4829   case Instruction::Mul:
4830   case Instruction::FMul:
4831   case Instruction::FDiv:
4832   case Instruction::FRem:
4833   case Instruction::Shl:
4834   case Instruction::LShr:
4835   case Instruction::AShr:
4836   case Instruction::And:
4837   case Instruction::Or:
4838   case Instruction::Xor: {
4839     // Just widen unops and binops.
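    // For example, a scalar 'add nsw i32' becomes one 'add nsw <VF x i32>' per
    // unroll part, with the IR flags copied from the original instruction.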
4840     setDebugLocFromInst(Builder, &I);
4841 
4842     for (unsigned Part = 0; Part < UF; ++Part) {
4843       SmallVector<Value *, 2> Ops;
4844       for (VPValue *VPOp : User.operands())
4845         Ops.push_back(State.get(VPOp, Part));
4846 
4847       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4848 
4849       if (auto *VecOp = dyn_cast<Instruction>(V))
4850         VecOp->copyIRFlags(&I);
4851 
4852       // Use this vector value for all users of the original instruction.
4853       State.set(Def, V, Part);
4854       addMetadata(V, &I);
4855     }
4856 
4857     break;
4858   }
4859   case Instruction::ICmp:
4860   case Instruction::FCmp: {
4861     // Widen compares. Generate vector compares.
4862     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4863     auto *Cmp = cast<CmpInst>(&I);
4864     setDebugLocFromInst(Builder, Cmp);
4865     for (unsigned Part = 0; Part < UF; ++Part) {
4866       Value *A = State.get(User.getOperand(0), Part);
4867       Value *B = State.get(User.getOperand(1), Part);
4868       Value *C = nullptr;
4869       if (FCmp) {
4870         // Propagate fast math flags.
4871         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4872         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4873         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4874       } else {
4875         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4876       }
4877       State.set(Def, C, Part);
4878       addMetadata(C, &I);
4879     }
4880 
4881     break;
4882   }
4883 
4884   case Instruction::ZExt:
4885   case Instruction::SExt:
4886   case Instruction::FPToUI:
4887   case Instruction::FPToSI:
4888   case Instruction::FPExt:
4889   case Instruction::PtrToInt:
4890   case Instruction::IntToPtr:
4891   case Instruction::SIToFP:
4892   case Instruction::UIToFP:
4893   case Instruction::Trunc:
4894   case Instruction::FPTrunc:
4895   case Instruction::BitCast: {
4896     auto *CI = cast<CastInst>(&I);
4897     setDebugLocFromInst(Builder, CI);
4898 
4899     // Vectorize casts.
4900     Type *DestTy =
4901         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4902 
4903     for (unsigned Part = 0; Part < UF; ++Part) {
4904       Value *A = State.get(User.getOperand(0), Part);
4905       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4906       State.set(Def, Cast, Part);
4907       addMetadata(Cast, &I);
4908     }
4909     break;
4910   }
4911   default:
4912     // This instruction is not vectorized by simple widening.
4913     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4914     llvm_unreachable("Unhandled instruction!");
4915   } // end of switch.
4916 }
4917 
4918 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4919                                                VPUser &ArgOperands,
4920                                                VPTransformState &State) {
4921   assert(!isa<DbgInfoIntrinsic>(I) &&
4922          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4923   setDebugLocFromInst(Builder, &I);
4924 
4925   Module *M = I.getParent()->getParent()->getParent();
4926   auto *CI = cast<CallInst>(&I);
4927 
4928   SmallVector<Type *, 4> Tys;
4929   for (Value *ArgOperand : CI->arg_operands())
4930     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4931 
4932   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4933 
4934   // This flag indicates whether to use an intrinsic or a library call for the
4935   // vectorized version of the instruction, i.e. whether the intrinsic call is
4936   // at least as beneficial as the library call.
4937   bool NeedToScalarize = false;
4938   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4939   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4940   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4941   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4942          "Instruction should be scalarized elsewhere.");
4943   assert(IntrinsicCost.isValid() && CallCost.isValid() &&
4944          "Cannot have invalid costs while widening");
4945 
4946   for (unsigned Part = 0; Part < UF; ++Part) {
4947     SmallVector<Value *, 4> Args;
4948     for (auto &I : enumerate(ArgOperands.operands())) {
4949       // Some intrinsics have a scalar argument - don't replace it with a
4950       // vector.
4951       Value *Arg;
4952       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4953         Arg = State.get(I.value(), Part);
4954       else
4955         Arg = State.get(I.value(), VPIteration(0, 0));
4956       Args.push_back(Arg);
4957     }
4958 
4959     Function *VectorF;
4960     if (UseVectorIntrinsic) {
4961       // Use vector version of the intrinsic.
4962       Type *TysForDecl[] = {CI->getType()};
4963       if (VF.isVector())
4964         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4965       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4966       assert(VectorF && "Can't retrieve vector intrinsic.");
4967     } else {
4968       // Use vector version of the function call.
4969       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4970 #ifndef NDEBUG
4971       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4972              "Can't create vector function.");
4973 #endif
4974       VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4975     }
4976     SmallVector<OperandBundleDef, 1> OpBundles;
4977     CI->getOperandBundlesAsDefs(OpBundles);
4978     CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4979 
4980     if (isa<FPMathOperator>(V))
4981       V->copyFastMathFlags(CI);
4982 
4983     State.set(Def, V, Part);
4984     addMetadata(V, &I);
4985   }
4986 }
4987 
4988 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
4989                                                  VPUser &Operands,
4990                                                  bool InvariantCond,
4991                                                  VPTransformState &State) {
4992   setDebugLocFromInst(Builder, &I);
4993 
4994   // The condition can be loop invariant but still defined inside the
4995   // loop. This means that we can't just use the original 'cond' value.
4996   // We have to take the 'vectorized' value and pick the first lane.
4997   // Instcombine will make this a no-op.
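  // For example, for a select whose condition is invariant, lane 0 of part 0 of
  // the vectorized condition drives the select in every part; otherwise the
  // full vector condition of each part is used.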
4998   auto *InvarCond = InvariantCond
4999                         ? State.get(Operands.getOperand(0), VPIteration(0, 0))
5000                         : nullptr;
5001 
5002   for (unsigned Part = 0; Part < UF; ++Part) {
5003     Value *Cond =
5004         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
5005     Value *Op0 = State.get(Operands.getOperand(1), Part);
5006     Value *Op1 = State.get(Operands.getOperand(2), Part);
5007     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
5008     State.set(VPDef, Sel, Part);
5009     addMetadata(Sel, &I);
5010   }
5011 }
5012 
5013 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
5014   // We should not collect Scalars more than once per VF. Right now, this
5015   // function is called from collectUniformsAndScalars(), which already does
5016   // this check. Collecting Scalars for VF=1 does not make any sense.
5017   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
5018          "This function should not be visited twice for the same VF");
5019 
5020   SmallSetVector<Instruction *, 8> Worklist;
5021 
5022   // These sets are used to seed the analysis with pointers used by memory
5023   // accesses that will remain scalar.
5024   SmallSetVector<Instruction *, 8> ScalarPtrs;
5025   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
5026   auto *Latch = TheLoop->getLoopLatch();
5027 
5028   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
5029   // The pointer operands of loads and stores will be scalar as long as the
5030   // memory access is not a gather or scatter operation. The value operand of a
5031   // store will remain scalar if the store is scalarized.
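  // For example, the pointer operand of a load that is scalarized or widened to
  // a consecutive access counts as a scalar use, whereas the pointer operand of
  // a gather does not.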
5032   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5033     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5034     assert(WideningDecision != CM_Unknown &&
5035            "Widening decision should be ready at this moment");
5036     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5037       if (Ptr == Store->getValueOperand())
5038         return WideningDecision == CM_Scalarize;
5039     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5040            "Ptr is neither a value nor a pointer operand");
5041     return WideningDecision != CM_GatherScatter;
5042   };
5043 
5044   // A helper that returns true if the given value is a bitcast or
5045   // getelementptr instruction contained in the loop.
5046   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5047     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5048             isa<GetElementPtrInst>(V)) &&
5049            !TheLoop->isLoopInvariant(V);
5050   };
5051 
5052   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5053     if (!isa<PHINode>(Ptr) ||
5054         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5055       return false;
5056     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5057     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5058       return false;
5059     return isScalarUse(MemAccess, Ptr);
5060   };
5061 
5062   // A helper that evaluates a memory access's use of a pointer. If the
5063   // pointer is actually the pointer induction of a loop, it is inserted
5064   // into Worklist. If the use will be a scalar use, and the
5065   // pointer is only used by memory accesses, we place the pointer in
5066   // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
5067   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5068     if (isScalarPtrInduction(MemAccess, Ptr)) {
5069       Worklist.insert(cast<Instruction>(Ptr));
5070       Instruction *Update = cast<Instruction>(
5071           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5072       Worklist.insert(Update);
5073       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5074                         << "\n");
5075       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
5076                         << "\n");
5077       return;
5078     }
5079     // We only care about bitcast and getelementptr instructions contained in
5080     // the loop.
5081     if (!isLoopVaryingBitCastOrGEP(Ptr))
5082       return;
5083 
5084     // If the pointer has already been identified as scalar (e.g., if it was
5085     // also identified as uniform), there's nothing to do.
5086     auto *I = cast<Instruction>(Ptr);
5087     if (Worklist.count(I))
5088       return;
5089 
5090     // If the use of the pointer will be a scalar use, and all users of the
5091     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5092     // place the pointer in PossibleNonScalarPtrs.
5093     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5094           return isa<LoadInst>(U) || isa<StoreInst>(U);
5095         }))
5096       ScalarPtrs.insert(I);
5097     else
5098       PossibleNonScalarPtrs.insert(I);
5099   };
5100 
5101   // We seed the scalars analysis with two classes of instructions: (1)
5102   // instructions marked uniform-after-vectorization and (2) bitcast,
5103   // getelementptr and (pointer) phi instructions used by memory accesses
5104   // requiring a scalar use.
5105   //
5106   // (1) Add to the worklist all instructions that have been identified as
5107   // uniform-after-vectorization.
5108   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5109 
5110   // (2) Add to the worklist all bitcast and getelementptr instructions used by
5111   // memory accesses requiring a scalar use. The pointer operands of loads and
5112   // stores will be scalar as long as the memory access is not a gather or
5113   // scatter operation. The value operand of a store will remain scalar if the
5114   // store is scalarized.
5115   for (auto *BB : TheLoop->blocks())
5116     for (auto &I : *BB) {
5117       if (auto *Load = dyn_cast<LoadInst>(&I)) {
5118         evaluatePtrUse(Load, Load->getPointerOperand());
5119       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5120         evaluatePtrUse(Store, Store->getPointerOperand());
5121         evaluatePtrUse(Store, Store->getValueOperand());
5122       }
5123     }
5124   for (auto *I : ScalarPtrs)
5125     if (!PossibleNonScalarPtrs.count(I)) {
5126       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5127       Worklist.insert(I);
5128     }
5129 
5130   // Insert the forced scalars.
5131   // FIXME: Currently widenPHIInstruction() often creates a dead vector
5132   // induction variable when the PHI user is scalarized.
5133   auto ForcedScalar = ForcedScalars.find(VF);
5134   if (ForcedScalar != ForcedScalars.end())
5135     for (auto *I : ForcedScalar->second)
5136       Worklist.insert(I);
5137 
5138   // Expand the worklist by looking through any bitcasts and getelementptr
5139   // instructions we've already identified as scalar. This is similar to the
5140   // expansion step in collectLoopUniforms(); however, here we're only
5141   // expanding to include additional bitcasts and getelementptr instructions.
5142   unsigned Idx = 0;
5143   while (Idx != Worklist.size()) {
5144     Instruction *Dst = Worklist[Idx++];
5145     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5146       continue;
5147     auto *Src = cast<Instruction>(Dst->getOperand(0));
5148     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5149           auto *J = cast<Instruction>(U);
5150           return !TheLoop->contains(J) || Worklist.count(J) ||
5151                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5152                   isScalarUse(J, Src));
5153         })) {
5154       Worklist.insert(Src);
5155       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5156     }
5157   }
5158 
5159   // An induction variable will remain scalar if all users of the induction
5160   // variable and induction variable update remain scalar.
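  // For example, an induction whose only in-loop users are its own update and
  // already-scalar address computations is kept scalar, together with that
  // update.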
5161   for (auto &Induction : Legal->getInductionVars()) {
5162     auto *Ind = Induction.first;
5163     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5164 
5165     // If tail-folding is applied, the primary induction variable will be used
5166     // to feed a vector compare.
5167     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5168       continue;
5169 
5170     // Determine if all users of the induction variable are scalar after
5171     // vectorization.
5172     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5173       auto *I = cast<Instruction>(U);
5174       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5175     });
5176     if (!ScalarInd)
5177       continue;
5178 
5179     // Determine if all users of the induction variable update instruction are
5180     // scalar after vectorization.
5181     auto ScalarIndUpdate =
5182         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5183           auto *I = cast<Instruction>(U);
5184           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5185         });
5186     if (!ScalarIndUpdate)
5187       continue;
5188 
5189     // The induction variable and its update instruction will remain scalar.
5190     Worklist.insert(Ind);
5191     Worklist.insert(IndUpdate);
5192     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5193     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5194                       << "\n");
5195   }
5196 
5197   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5198 }
5199 
5200 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
5201                                                          ElementCount VF) {
5202   if (!blockNeedsPredication(I->getParent()))
5203     return false;
5204   switch(I->getOpcode()) {
5205   default:
5206     break;
5207   case Instruction::Load:
5208   case Instruction::Store: {
5209     if (!Legal->isMaskRequired(I))
5210       return false;
5211     auto *Ptr = getLoadStorePointerOperand(I);
5212     auto *Ty = getMemInstValueType(I);
5213     // We have already decided how to vectorize this instruction; get that
5214     // result.
5215     if (VF.isVector()) {
5216       InstWidening WideningDecision = getWideningDecision(I, VF);
5217       assert(WideningDecision != CM_Unknown &&
5218              "Widening decision should be ready at this moment");
5219       return WideningDecision == CM_Scalarize;
5220     }
5221     const Align Alignment = getLoadStoreAlignment(I);
5222     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5223                                 isLegalMaskedGather(Ty, Alignment))
5224                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5225                                 isLegalMaskedScatter(Ty, Alignment));
5226   }
5227   case Instruction::UDiv:
5228   case Instruction::SDiv:
5229   case Instruction::SRem:
5230   case Instruction::URem:
5231     return mayDivideByZero(*I);
5232   }
5233   return false;
5234 }
5235 
5236 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5237     Instruction *I, ElementCount VF) {
5238   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5239   assert(getWideningDecision(I, VF) == CM_Unknown &&
5240          "Decision should not be set yet.");
5241   auto *Group = getInterleavedAccessGroup(I);
5242   assert(Group && "Must have a group.");
5243 
5244   // If the instruction's allocated size doesn't equal its type size, it
5245   // requires padding and will be scalarized.
5246   auto &DL = I->getModule()->getDataLayout();
5247   auto *ScalarTy = getMemInstValueType(I);
5248   if (hasIrregularType(ScalarTy, DL, VF))
5249     return false;
5250 
5251   // Check if masking is required.
5252   // A Group may need masking for one of two reasons: it resides in a block that
5253   // needs predication, or it was decided to use masking to deal with gaps.
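  // For example, an access group with a tail gap that would normally be handled
  // by a scalar epilogue must instead be masked when no scalar epilogue is
  // allowed.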
5254   bool PredicatedAccessRequiresMasking =
5255       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5256   bool AccessWithGapsRequiresMasking =
5257       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5258   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5259     return true;
5260 
5261   // If masked interleaving is required, we expect that the user/target had
5262   // enabled it, because otherwise it either wouldn't have been created or
5263   // it should have been invalidated by the CostModel.
5264   assert(useMaskedInterleavedAccesses(TTI) &&
5265          "Masked interleave-groups for predicated accesses are not enabled.");
5266 
5267   auto *Ty = getMemInstValueType(I);
5268   const Align Alignment = getLoadStoreAlignment(I);
5269   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5270                           : TTI.isLegalMaskedStore(Ty, Alignment);
5271 }
5272 
5273 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5274     Instruction *I, ElementCount VF) {
5275   // Get and ensure we have a valid memory instruction.
5276   LoadInst *LI = dyn_cast<LoadInst>(I);
5277   StoreInst *SI = dyn_cast<StoreInst>(I);
5278   assert((LI || SI) && "Invalid memory instruction");
5279 
5280   auto *Ptr = getLoadStorePointerOperand(I);
5281 
5282   // First of all, in order to be widened, the pointer must be consecutive.
5283   if (!Legal->isConsecutivePtr(Ptr))
5284     return false;
5285 
5286   // If the instruction is a store located in a predicated block, it will be
5287   // scalarized.
5288   if (isScalarWithPredication(I))
5289     return false;
5290 
5291   // If the instruction's allocated size doesn't equal its type size, it
5292   // requires padding and will be scalarized.
5293   auto &DL = I->getModule()->getDataLayout();
5294   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5295   if (hasIrregularType(ScalarTy, DL, VF))
5296     return false;
5297 
5298   return true;
5299 }
5300 
5301 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5302   // We should not collect Uniforms more than once per VF. Right now,
5303   // this function is called from collectUniformsAndScalars(), which
5304   // already does this check. Collecting Uniforms for VF=1 does not make any
5305   // sense.
5306 
5307   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5308          "This function should not be visited twice for the same VF");
5309 
5310   // Visit the list of Uniforms. If we do not find any uniform value, we will
5311   // not analyze again. Uniforms.count(VF) will then return 1.
5312   Uniforms[VF].clear();
5313 
5314   // We now know that the loop is vectorizable!
5315   // Collect instructions inside the loop that will remain uniform after
5316   // vectorization.
5317 
5318   // Global values, parameters, and instructions outside of the current loop
5319   // are out of scope.
5320   auto isOutOfScope = [&](Value *V) -> bool {
5321     Instruction *I = dyn_cast<Instruction>(V);
5322     return (!I || !TheLoop->contains(I));
5323   };
5324 
5325   SetVector<Instruction *> Worklist;
5326   BasicBlock *Latch = TheLoop->getLoopLatch();
5327 
5328   // Instructions that are scalar with predication must not be considered
5329   // uniform after vectorization, because that would create an erroneous
5330   // replicating region where only a single instance out of VF should be formed.
5331   // TODO: optimize such seldom cases if found important, see PR40816.
5332   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5333     if (isOutOfScope(I)) {
5334       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5335                         << *I << "\n");
5336       return;
5337     }
5338     if (isScalarWithPredication(I, VF)) {
5339       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5340                         << *I << "\n");
5341       return;
5342     }
5343     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5344     Worklist.insert(I);
5345   };
5346 
5347   // Start with the conditional branch. If the branch condition is an
5348   // instruction contained in the loop that is only used by the branch, it is
5349   // uniform.
5350   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5351   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5352     addToWorklistIfAllowed(Cmp);
5353 
5354   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5355     InstWidening WideningDecision = getWideningDecision(I, VF);
5356     assert(WideningDecision != CM_Unknown &&
5357            "Widening decision should be ready at this moment");
5358 
5359     // A uniform memory op is itself uniform.  We exclude uniform stores
5360     // here as they demand the last lane, not the first one.
5361     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5362       assert(WideningDecision == CM_Scalarize);
5363       return true;
5364     }
5365 
5366     return (WideningDecision == CM_Widen ||
5367             WideningDecision == CM_Widen_Reverse ||
5368             WideningDecision == CM_Interleave);
5369   };
5370 
5371 
5372   // Returns true if Ptr is the pointer operand of a memory access instruction
5373   // I, and I is known to not require scalarization.
5374   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5375     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5376   };
5377 
5378   // Holds a list of values which are known to have at least one uniform use.
5379   // Note that there may be other uses which aren't uniform.  A "uniform use"
5380   // here is something which only demands lane 0 of the unrolled iterations;
5381   // it does not imply that all lanes produce the same value (e.g. this is not
5382   // the usual meaning of uniform).
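  // For example, the address of a consecutive widened load only demands lane 0,
  // since the remaining lanes are implied by consecutiveness.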
5383   SmallPtrSet<Value *, 8> HasUniformUse;
5384 
5385   // Scan the loop for instructions which are either a) known to have only
5386   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5387   for (auto *BB : TheLoop->blocks())
5388     for (auto &I : *BB) {
5389       // If there's no pointer operand, there's nothing to do.
5390       auto *Ptr = getLoadStorePointerOperand(&I);
5391       if (!Ptr)
5392         continue;
5393 
5394       // A uniform memory op is itself uniform.  We exclude uniform stores
5395       // here as they demand the last lane, not the first one.
5396       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5397         addToWorklistIfAllowed(&I);
5398 
5399       if (isUniformDecision(&I, VF)) {
5400         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5401         HasUniformUse.insert(Ptr);
5402       }
5403     }
5404 
5405   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5406   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5407   // disallows uses outside the loop as well.
5408   for (auto *V : HasUniformUse) {
5409     if (isOutOfScope(V))
5410       continue;
5411     auto *I = cast<Instruction>(V);
5412     auto UsersAreMemAccesses =
5413       llvm::all_of(I->users(), [&](User *U) -> bool {
5414         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5415       });
5416     if (UsersAreMemAccesses)
5417       addToWorklistIfAllowed(I);
5418   }
5419 
5420   // Expand Worklist in topological order: whenever a new instruction
5421   // is added, its users should already be inside Worklist. This ensures
5422   // that a uniform instruction will only be used by uniform instructions.
5423   unsigned idx = 0;
5424   while (idx != Worklist.size()) {
5425     Instruction *I = Worklist[idx++];
5426 
5427     for (auto OV : I->operand_values()) {
5428       // isOutOfScope operands cannot be uniform instructions.
5429       if (isOutOfScope(OV))
5430         continue;
5431       // First-order recurrence phis should typically be considered
5432       // non-uniform.
5433       auto *OP = dyn_cast<PHINode>(OV);
5434       if (OP && Legal->isFirstOrderRecurrence(OP))
5435         continue;
5436       // If all the users of the operand are uniform, then add the
5437       // operand into the uniform worklist.
5438       auto *OI = cast<Instruction>(OV);
5439       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5440             auto *J = cast<Instruction>(U);
5441             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5442           }))
5443         addToWorklistIfAllowed(OI);
5444     }
5445   }
5446 
5447   // For an instruction to be added into Worklist above, all its users inside
5448   // the loop should also be in Worklist. However, this condition cannot be
5449   // true for phi nodes that form a cyclic dependence. We must process phi
5450   // nodes separately. An induction variable will remain uniform if all users
5451   // of the induction variable and induction variable update remain uniform.
5452   // The code below handles both pointer and non-pointer induction variables.
5453   for (auto &Induction : Legal->getInductionVars()) {
5454     auto *Ind = Induction.first;
5455     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5456 
5457     // Determine if all users of the induction variable are uniform after
5458     // vectorization.
5459     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5460       auto *I = cast<Instruction>(U);
5461       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5462              isVectorizedMemAccessUse(I, Ind);
5463     });
5464     if (!UniformInd)
5465       continue;
5466 
5467     // Determine if all users of the induction variable update instruction are
5468     // uniform after vectorization.
5469     auto UniformIndUpdate =
5470         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5471           auto *I = cast<Instruction>(U);
5472           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5473                  isVectorizedMemAccessUse(I, IndUpdate);
5474         });
5475     if (!UniformIndUpdate)
5476       continue;
5477 
5478     // The induction variable and its update instruction will remain uniform.
5479     addToWorklistIfAllowed(Ind);
5480     addToWorklistIfAllowed(IndUpdate);
5481   }
5482 
5483   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5484 }
5485 
5486 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5487   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5488 
5489   if (Legal->getRuntimePointerChecking()->Need) {
5490     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5491         "runtime pointer checks needed. Enable vectorization of this "
5492         "loop with '#pragma clang loop vectorize(enable)' when "
5493         "compiling with -Os/-Oz",
5494         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5495     return true;
5496   }
5497 
5498   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5499     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5500         "runtime SCEV checks needed. Enable vectorization of this "
5501         "loop with '#pragma clang loop vectorize(enable)' when "
5502         "compiling with -Os/-Oz",
5503         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5504     return true;
5505   }
5506 
5507   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5508   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5509     reportVectorizationFailure("Runtime stride check for small trip count",
5510         "runtime stride == 1 checks needed. Enable vectorization of "
5511         "this loop without such check by compiling with -Os/-Oz",
5512         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5513     return true;
5514   }
5515 
5516   return false;
5517 }
5518 
5519 Optional<ElementCount>
5520 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5521   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5522     // TODO: It may be useful to do this, since the check is still likely to
5523     // be dynamically uniform if the target can skip it.
5524     reportVectorizationFailure(
5525         "Not inserting runtime ptr check for divergent target",
5526         "runtime pointer checks needed. Not enabled for divergent target",
5527         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5528     return None;
5529   }
5530 
5531   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5532   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5533   if (TC == 1) {
5534     reportVectorizationFailure("Single iteration (non) loop",
5535         "loop trip count is one, irrelevant for vectorization",
5536         "SingleIterationLoop", ORE, TheLoop);
5537     return None;
5538   }
5539 
5540   switch (ScalarEpilogueStatus) {
5541   case CM_ScalarEpilogueAllowed:
5542     return computeFeasibleMaxVF(TC, UserVF);
5543   case CM_ScalarEpilogueNotAllowedUsePredicate:
5544     LLVM_FALLTHROUGH;
5545   case CM_ScalarEpilogueNotNeededUsePredicate:
5546     LLVM_DEBUG(
5547         dbgs() << "LV: vector predicate hint/switch found.\n"
5548                << "LV: Not allowing scalar epilogue, creating predicated "
5549                << "vector loop.\n");
5550     break;
5551   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5552     // fallthrough as a special case of OptForSize
5553   case CM_ScalarEpilogueNotAllowedOptSize:
5554     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5555       LLVM_DEBUG(
5556           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5557     else
5558       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5559                         << "count.\n");
5560 
5561     // Bail if runtime checks are required, which are not good when optimising
5562     // for size.
5563     if (runtimeChecksRequired())
5564       return None;
5565 
5566     break;
5567   }
5568 
5569   // The only loops we can vectorize without a scalar epilogue are loops with
5570   // a bottom-test and a single exiting block. We'd have to handle the fact
5571   // that not every instruction executes on the last iteration.  This will
5572   // require a lane mask which varies through the vector loop body.  (TODO)
5573   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5574     // If there was a tail-folding hint/switch, but we can't fold the tail by
5575     // masking, fallback to a vectorization with a scalar epilogue.
5576     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5577       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5578                            "scalar epilogue instead.\n");
5579       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5580       return computeFeasibleMaxVF(TC, UserVF);
5581     }
5582     return None;
5583   }
5584 
5585   // Now try to fold the tail by masking.
5586 
5587   // Invalidate interleave groups that require an epilogue if we can't mask
5588   // the interleave-group.
5589   if (!useMaskedInterleavedAccesses(TTI)) {
5590     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5591            "No decisions should have been taken at this point");
5592     // Note: There is no need to invalidate any cost modeling decisions here, as
5593     // none were taken so far.
5594     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5595   }
5596 
5597   ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
5598   assert(!MaxVF.isScalable() &&
5599          "Scalable vectors do not yet support tail folding");
5600   assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
5601          "MaxVF must be a power of 2");
5602   unsigned MaxVFtimesIC =
5603       UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
5604   // Avoid tail folding if the trip count is known to be a multiple of any VF
5605   // we may choose.
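  // For example, with a trip count of 1024 and MaxVF * UserIC == 8 the
  // remainder is zero, so neither tail folding nor a scalar epilogue is needed.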
5606   ScalarEvolution *SE = PSE.getSE();
5607   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5608   const SCEV *ExitCount = SE->getAddExpr(
5609       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5610   const SCEV *Rem = SE->getURemExpr(
5611       SE->applyLoopGuards(ExitCount, TheLoop),
5612       SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5613   if (Rem->isZero()) {
5614     // Accept MaxVF if we do not have a tail.
5615     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5616     return MaxVF;
5617   }
5618 
5619   // If we don't know the precise trip count, or if the trip count that we
5620   // found modulo the vectorization factor is not zero, try to fold the tail
5621   // by masking.
5622   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5623   if (Legal->prepareToFoldTailByMasking()) {
5624     FoldTailByMasking = true;
5625     return MaxVF;
5626   }
5627 
5628   // If there was a tail-folding hint/switch, but we can't fold the tail by
5629   // masking, fallback to a vectorization with a scalar epilogue.
5630   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5631     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5632                          "scalar epilogue instead.\n");
5633     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5634     return MaxVF;
5635   }
5636 
5637   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5638     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5639     return None;
5640   }
5641 
5642   if (TC == 0) {
5643     reportVectorizationFailure(
5644         "Unable to calculate the loop count due to complex control flow",
5645         "unable to calculate the loop count due to complex control flow",
5646         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5647     return None;
5648   }
5649 
5650   reportVectorizationFailure(
5651       "Cannot optimize for size and vectorize at the same time.",
5652       "cannot optimize for size and vectorize at the same time. "
5653       "Enable vectorization of this loop with '#pragma clang loop "
5654       "vectorize(enable)' when compiling with -Os/-Oz",
5655       "NoTailLoopWithOptForSize", ORE, TheLoop);
5656   return None;
5657 }
5658 
5659 ElementCount
5660 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5661                                                  ElementCount UserVF) {
5662   bool IgnoreScalableUserVF = UserVF.isScalable() &&
5663                               !TTI.supportsScalableVectors() &&
5664                               !ForceTargetSupportsScalableVectors;
5665   if (IgnoreScalableUserVF) {
5666     LLVM_DEBUG(
5667         dbgs() << "LV: Ignoring VF=" << UserVF
5668                << " because target does not support scalable vectors.\n");
5669     ORE->emit([&]() {
5670       return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF",
5671                                         TheLoop->getStartLoc(),
5672                                         TheLoop->getHeader())
5673              << "Ignoring VF=" << ore::NV("UserVF", UserVF)
5674              << " because target does not support scalable vectors.";
5675     });
5676   }
5677 
5678   // Beyond this point two scenarios are handled. If UserVF isn't specified
5679   // then a suitable VF is chosen. If UserVF is specified and there are
5680   // dependencies, check if it's legal. However, if a UserVF is specified and
5681   // there are no dependencies, then there's nothing to do.
5682   if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
5683     if (!canVectorizeReductions(UserVF)) {
5684       reportVectorizationFailure(
5685           "LV: Scalable vectorization not supported for the reduction "
5686           "operations found in this loop. Using fixed-width "
5687           "vectorization instead.",
5688           "Scalable vectorization not supported for the reduction operations "
5689           "found in this loop. Using fixed-width vectorization instead.",
5690           "ScalableVFUnfeasible", ORE, TheLoop);
5691       return computeFeasibleMaxVF(
5692           ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
5693     }
5694 
5695     if (Legal->isSafeForAnyVectorWidth())
5696       return UserVF;
5697   }
5698 
5699   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5700   unsigned SmallestType, WidestType;
5701   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5702   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5703 
5704   // Get the maximum safe dependence distance in bits computed by LAA.
5705   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5706   // the memory accesses that is most restrictive (involved in the smallest
5707   // dependence distance).
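  // For example, a maximum safe dependence distance of 16 bytes between 32-bit
  // accesses corresponds to 4 elements, i.e. a width of 128 bits.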
5708   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
5709 
5710   // If the user vectorization factor is legally unsafe, clamp it to a safe
5711   // value. Otherwise, return as is.
5712   if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
5713     unsigned MaxSafeElements =
5714         PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
5715     ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
5716 
5717     if (UserVF.isScalable()) {
5718       Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5719 
5720       // Scale VF by vscale before checking if it's safe.
5721       MaxSafeVF = ElementCount::getScalable(
5722           MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
5723 
5724       if (MaxSafeVF.isZero()) {
5725         // The dependence distance is too small to use scalable vectors,
5726         // fallback on fixed.
5727         LLVM_DEBUG(
5728             dbgs()
5729             << "LV: Max legal vector width too small, scalable vectorization "
5730                "unfeasible. Using fixed-width vectorization instead.\n");
5731         ORE->emit([&]() {
5732           return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
5733                                             TheLoop->getStartLoc(),
5734                                             TheLoop->getHeader())
5735                  << "Max legal vector width too small, scalable vectorization "
5736                  << "unfeasible. Using fixed-width vectorization instead.";
5737         });
5738         return computeFeasibleMaxVF(
5739             ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
5740       }
5741     }
5742 
5743     LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
5744 
5745     if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
5746       return UserVF;
5747 
5748     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5749                       << " is unsafe, clamping to max safe VF=" << MaxSafeVF
5750                       << ".\n");
5751     ORE->emit([&]() {
5752       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5753                                         TheLoop->getStartLoc(),
5754                                         TheLoop->getHeader())
5755              << "User-specified vectorization factor "
5756              << ore::NV("UserVectorizationFactor", UserVF)
5757              << " is unsafe, clamping to maximum safe vectorization factor "
5758              << ore::NV("VectorizationFactor", MaxSafeVF);
5759     });
5760     return MaxSafeVF;
5761   }
5762 
5763   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
5764 
5765   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5766   // Note that both WidestRegister and WidestType may not be powers of 2.
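  // For example, a 256-bit widest register and a 32-bit widest type give a
  // default MaxVectorSize of 8 elements.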
5767   auto MaxVectorSize =
5768       ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType));
5769 
5770   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5771                     << " / " << WidestType << " bits.\n");
5772   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5773                     << WidestRegister << " bits.\n");
5774 
5775   assert(MaxVectorSize.getFixedValue() <= WidestRegister &&
5776          "Did not expect to pack so many elements"
5777          " into one vector!");
5778   if (MaxVectorSize.getFixedValue() == 0) {
5779     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5780     return ElementCount::getFixed(1);
5781   } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() &&
5782              isPowerOf2_32(ConstTripCount)) {
5783     // We need to clamp the VF to be the ConstTripCount. There is no point in
5784     // choosing a higher viable VF as done in the loop below.
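    // For example, a constant trip count of 4 with a MaxVectorSize of 8 clamps
    // the VF to 4.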
5785     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5786                       << ConstTripCount << "\n");
5787     return ElementCount::getFixed(ConstTripCount);
5788   }
5789 
5790   ElementCount MaxVF = MaxVectorSize;
5791   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5792       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5793     // Collect all viable vectorization factors larger than the default MaxVF
5794     // (i.e. MaxVectorSize).
5795     SmallVector<ElementCount, 8> VFs;
5796     auto MaxVectorSizeMaxBW =
5797         ElementCount::getFixed(WidestRegister / SmallestType);
5798     for (ElementCount VS = MaxVectorSize * 2;
5799          ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2)
5800       VFs.push_back(VS);
5801 
5802     // For each VF calculate its register usage.
5803     auto RUs = calculateRegisterUsage(VFs);
5804 
5805     // Select the largest VF which doesn't require more registers than existing
5806     // ones.
5807     for (int i = RUs.size() - 1; i >= 0; --i) {
5808       bool Selected = true;
5809       for (auto &pair : RUs[i].MaxLocalUsers) {
5810         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5811         if (pair.second > TargetNumRegisters)
5812           Selected = false;
5813       }
5814       if (Selected) {
5815         MaxVF = VFs[i];
5816         break;
5817       }
5818     }
5819     if (ElementCount MinVF =
5820             TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) {
5821       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5822         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5823                           << ") with target's minimum: " << MinVF << '\n');
5824         MaxVF = MinVF;
5825       }
5826     }
5827   }
5828   return MaxVF;
5829 }
5830 
5831 VectorizationFactor
5832 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
5833   // FIXME: This can be fixed for scalable vectors later, because at this stage
5834   // the LoopVectorizer will only consider vectorizing a loop with scalable
5835   // vectors when the loop has a hint to enable vectorization for a given VF.
5836   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
5837 
5838   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5839   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5840   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5841 
5842   auto Width = ElementCount::getFixed(1);
5843   const float ScalarCost = *ExpectedCost.getValue();
5844   float Cost = ScalarCost;
5845 
5846   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5847   if (ForceVectorization && MaxVF.isVector()) {
5848     // Ignore scalar width, because the user explicitly wants vectorization.
5849     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5850     // evaluation.
5851     Cost = std::numeric_limits<float>::max();
5852   }
5853 
5854   for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF);
5855        i *= 2) {
5856     // Notice that the vector loop needs to be executed fewer times, so
5857     // we need to divide the cost of the vector loop by the width of
5858     // the vector elements.
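    // For example, a vector loop of width 4 with a total cost of 20 has a
    // per-lane cost of 5, which is what gets compared against the scalar cost.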
5859     VectorizationCostTy C = expectedCost(i);
5860     assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
5861     float VectorCost = *C.first.getValue() / (float)i.getFixedValue();
5862     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5863                       << " costs: " << (int)VectorCost << ".\n");
5864     if (!C.second && !ForceVectorization) {
5865       LLVM_DEBUG(
5866           dbgs() << "LV: Not considering vector loop of width " << i
5867                  << " because it will not generate any vector instructions.\n");
5868       continue;
5869     }
5870 
5871     // If profitable add it to ProfitableVF list.
5872     if (VectorCost < ScalarCost) {
5873       ProfitableVFs.push_back(VectorizationFactor(
5874           {i, (unsigned)VectorCost}));
5875     }
5876 
5877     if (VectorCost < Cost) {
5878       Cost = VectorCost;
5879       Width = i;
5880     }
5881   }
5882 
5883   if (!EnableCondStoresVectorization && NumPredStores) {
5884     reportVectorizationFailure("There are conditional stores.",
5885         "store that is conditionally executed prevents vectorization",
5886         "ConditionalStore", ORE, TheLoop);
5887     Width = ElementCount::getFixed(1);
5888     Cost = ScalarCost;
5889   }
5890 
5891   LLVM_DEBUG(if (ForceVectorization && !Width.isScalar() && Cost >= ScalarCost) dbgs()
5892              << "LV: Vectorization seems to be not beneficial, "
5893              << "but was forced by a user.\n");
5894   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5895   VectorizationFactor Factor = {Width,
5896                                 (unsigned)(Width.getKnownMinValue() * Cost)};
5897   return Factor;
5898 }
5899 
5900 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5901     const Loop &L, ElementCount VF) const {
5902   // Cross iteration phis such as reductions need special handling and are
5903   // currently unsupported.
5904   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5905         return Legal->isFirstOrderRecurrence(&Phi) ||
5906                Legal->isReductionVariable(&Phi);
5907       }))
5908     return false;
5909 
5910   // Phis with uses outside of the loop require special handling and are
5911   // currently unsupported.
5912   for (auto &Entry : Legal->getInductionVars()) {
5913     // Look for uses of the value of the induction at the last iteration.
5914     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5915     for (User *U : PostInc->users())
5916       if (!L.contains(cast<Instruction>(U)))
5917         return false;
5918     // Look for uses of penultimate value of the induction.
5919     for (User *U : Entry.first->users())
5920       if (!L.contains(cast<Instruction>(U)))
5921         return false;
5922   }
5923 
5924   // Induction variables that are widened require special handling that is
5925   // currently not supported.
5926   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5927         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5928                  this->isProfitableToScalarize(Entry.first, VF));
5929       }))
5930     return false;
5931 
5932   return true;
5933 }
5934 
5935 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5936     const ElementCount VF) const {
5937   // FIXME: We need a much better cost-model to take different parameters such
5938   // as register pressure, code size increase and cost of extra branches into
5939   // account. For now we apply a very crude heuristic and only consider loops
5940   // with vectorization factors larger than a certain value.
5941   // We also consider epilogue vectorization unprofitable for targets that don't
5942   // consider interleaving beneficial (e.g. MVE).
5943   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5944     return false;
5945   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5946     return true;
5947   return false;
5948 }
5949 
5950 VectorizationFactor
5951 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5952     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5953   VectorizationFactor Result = VectorizationFactor::Disabled();
5954   if (!EnableEpilogueVectorization) {
5955     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5956     return Result;
5957   }
5958 
5959   if (!isScalarEpilogueAllowed()) {
5960     LLVM_DEBUG(
5961         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5962                   "allowed.\n";);
5963     return Result;
5964   }
5965 
5966   // FIXME: This can be fixed for scalable vectors later, because at this stage
5967   // the LoopVectorizer will only consider vectorizing a loop with scalable
5968   // vectors when the loop has a hint to enable vectorization for a given VF.
5969   if (MainLoopVF.isScalable()) {
5970     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
5971                          "yet supported.\n");
5972     return Result;
5973   }
5974 
5975   // Not really a cost consideration, but check for unsupported cases here to
5976   // simplify the logic.
5977   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5978     LLVM_DEBUG(
5979         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5980                   "not a supported candidate.\n";);
5981     return Result;
5982   }
5983 
5984   if (EpilogueVectorizationForceVF > 1) {
5985     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5986     if (LVP.hasPlanWithVFs(
5987             {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
5988       return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
5989     else {
5990       LLVM_DEBUG(
5991           dbgs()
5992               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5993       return Result;
5994     }
5995   }
5996 
5997   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5998       TheLoop->getHeader()->getParent()->hasMinSize()) {
5999     LLVM_DEBUG(
6000         dbgs()
6001             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
6002     return Result;
6003   }
6004 
6005   if (!isEpilogueVectorizationProfitable(MainLoopVF))
6006     return Result;
6007 
6008   for (auto &NextVF : ProfitableVFs)
6009     if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
6010         (Result.Width.getFixedValue() == 1 || NextVF.Cost < Result.Cost) &&
6011         LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
6012       Result = NextVF;
6013 
6014   if (Result != VectorizationFactor::Disabled())
6015     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
6016                       << Result.Width.getFixedValue() << "\n";);
6017   return Result;
6018 }
6019 
6020 std::pair<unsigned, unsigned>
6021 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
6022   unsigned MinWidth = -1U;
6023   unsigned MaxWidth = 8;
6024   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
6025 
6026   // For each block.
6027   for (BasicBlock *BB : TheLoop->blocks()) {
6028     // For each instruction in the loop.
6029     for (Instruction &I : BB->instructionsWithoutDebug()) {
6030       Type *T = I.getType();
6031 
6032       // Skip ignored values.
6033       if (ValuesToIgnore.count(&I))
6034         continue;
6035 
6036       // Only examine Loads, Stores and PHINodes.
6037       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
6038         continue;
6039 
6040       // Examine PHI nodes that are reduction variables. Update the type to
6041       // account for the recurrence type.
6042       if (auto *PN = dyn_cast<PHINode>(&I)) {
6043         if (!Legal->isReductionVariable(PN))
6044           continue;
6045         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
6046         if (PreferInLoopReductions ||
6047             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
6048                                       RdxDesc.getRecurrenceType(),
6049                                       TargetTransformInfo::ReductionFlags()))
6050           continue;
6051         T = RdxDesc.getRecurrenceType();
6052       }
6053 
6054       // Examine the stored values.
6055       if (auto *ST = dyn_cast<StoreInst>(&I))
6056         T = ST->getValueOperand()->getType();
6057 
6058       // Ignore loaded pointer types and stored pointer types that are not
6059       // vectorizable.
6060       //
6061       // FIXME: The check here attempts to predict whether a load or store will
6062       //        be vectorized. We only know this for certain after a VF has
6063       //        been selected. Here, we assume that if an access can be
6064       //        vectorized, it will be. We should also look at extending this
6065       //        optimization to non-pointer types.
6066       //
6067       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
6068           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
6069         continue;
6070 
6071       MinWidth = std::min(MinWidth,
6072                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6073       MaxWidth = std::max(MaxWidth,
6074                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6075     }
6076   }
6077 
6078   return {MinWidth, MaxWidth};
6079 }
6080 
6081 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
6082                                                            unsigned LoopCost) {
6083   // -- The interleave heuristics --
6084   // We interleave the loop in order to expose ILP and reduce the loop overhead.
6085   // There are many micro-architectural considerations that we can't predict
6086   // at this level. For example, frontend pressure (on decode or fetch) due to
6087   // code size, or the number and capabilities of the execution ports.
6088   //
6089   // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the
  // cross-iteration dependency.
6092   // 2. If the loop is really small, then we interleave to reduce the loop
6093   // overhead.
6094   // 3. We don't interleave if we think that we will spill registers to memory
6095   // due to the increased register pressure.
6096 
6097   if (!isScalarEpilogueAllowed())
6098     return 1;
6099 
  // If there is a maximum safe dependence distance, it already bounded the
  // vectorization width; do not interleave in that case.
6101   if (Legal->getMaxSafeDepDistBytes() != -1U)
6102     return 1;
6103 
6104   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6105   const bool HasReductions = !Legal->getReductionVars().empty();
6106   // Do not interleave loops with a relatively small known or estimated trip
6107   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because under those conditions interleaving can expose ILP and break the
  // cross-iteration dependences of the reductions.
6111   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6112       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6113     return 1;
6114 
6115   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these counts below, so clamp each one to at least 1 (i.e.
  // assume at least one instruction uses at least one register).
6118   for (auto& pair : R.MaxLocalUsers) {
6119     pair.second = std::max(pair.second, 1U);
6120   }
6121 
6122   // We calculate the interleave count using the following formula.
6123   // Subtract the number of loop invariants from the number of available
6124   // registers. These registers are used by all of the interleaved instances.
6125   // Next, divide the remaining registers by the number of registers that is
6126   // required by the loop, in order to estimate how many parallel instances
6127   // fit without causing spills. All of this is rounded down if necessary to be
6128   // a power of two. We want power of two interleave count to simplify any
6129   // addressing operations or alignment considerations.
6130   // We also want power of two interleave counts to ensure that the induction
6131   // variable of the vector loop wraps to zero, when tail is folded by masking;
6132   // this currently happens when OptForSize, in which case IC is set to 1 above.
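  // For example (illustrative numbers only): with 32 registers in a class, 2
  // of them holding loop-invariant values and at most 5 simultaneously live
  // in-loop values, the estimate is PowerOf2Floor((32 - 2) / 5) =
  // PowerOf2Floor(6) = 4 interleaved instances (before the induction-variable
  // adjustment applied below).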
6133   unsigned IC = UINT_MAX;
6134 
6135   for (auto& pair : R.MaxLocalUsers) {
6136     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
6140     if (VF.isScalar()) {
6141       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6142         TargetNumRegisters = ForceTargetNumScalarRegs;
6143     } else {
6144       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6145         TargetNumRegisters = ForceTargetNumVectorRegs;
6146     }
6147     unsigned MaxLocalUsers = pair.second;
6148     unsigned LoopInvariantRegs = 0;
6149     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6150       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6151 
    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) /
                                   MaxLocalUsers);
6153     // Don't count the induction variable as interleaved.
6154     if (EnableIndVarRegisterHeur) {
6155       TmpIC =
6156           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6157                         std::max(1U, (MaxLocalUsers - 1)));
6158     }
6159 
6160     IC = std::min(IC, TmpIC);
6161   }
6162 
6163   // Clamp the interleave ranges to reasonable counts.
6164   unsigned MaxInterleaveCount =
6165       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6166 
6167   // Check if the user has overridden the max.
6168   if (VF.isScalar()) {
6169     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6170       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6171   } else {
6172     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6173       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6174   }
6175 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF, while making
  // sure it is at least 1.
6179   //
6180   // For scalable vectors we can't know if interleaving is beneficial. It may
6181   // not be beneficial for small loops if none of the lanes in the second vector
  // iteration is enabled. However, for larger loops, there is likely to be a
6183   // similar benefit as for fixed-width vectors. For now, we choose to leave
6184   // the InterleaveCount as if vscale is '1', although if some information about
6185   // the vector is known (e.g. min vector size), we can make a better decision.
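  // For example (illustrative): with an estimated trip count of 24 and a VF
  // of 8, the interleave count is capped at 24 / 8 = 3, regardless of what
  // the target would otherwise allow.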
6186   if (BestKnownTC) {
6187     MaxInterleaveCount =
6188         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6189     // Make sure MaxInterleaveCount is greater than 0.
6190     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6191   }
6192 
6193   assert(MaxInterleaveCount > 0 &&
6194          "Maximum interleave count must be greater than 0");
6195 
  // Clamp the calculated IC to be between 1 and the max interleave count that
  // the target and trip count allow.
6198   if (IC > MaxInterleaveCount)
6199     IC = MaxInterleaveCount;
6200   else
6201     // Make sure IC is greater than 0.
6202     IC = std::max(1u, IC);
6203 
6204   assert(IC > 0 && "Interleave count must be greater than 0.");
6205 
6206   // If we did not calculate the cost for VF (because the user selected the VF)
6207   // then we calculate the cost of VF here.
6208   if (LoopCost == 0) {
6209     assert(expectedCost(VF).first.isValid() && "Expected a valid cost");
6210     LoopCost = *expectedCost(VF).first.getValue();
6211   }
6212 
6213   assert(LoopCost && "Non-zero loop cost expected");
6214 
6215   // Interleave if we vectorized this loop and there is a reduction that could
6216   // benefit from interleaving.
6217   if (VF.isVector() && HasReductions) {
6218     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6219     return IC;
6220   }
6221 
6222   // Note that if we've already vectorized the loop we will have done the
6223   // runtime check and so interleaving won't require further checks.
6224   bool InterleavingRequiresRuntimePointerCheck =
6225       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6226 
6227   // We want to interleave small loops in order to reduce the loop overhead and
6228   // potentially expose ILP opportunities.
6229   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6230                     << "LV: IC is " << IC << '\n'
6231                     << "LV: VF is " << VF << '\n');
6232   const bool AggressivelyInterleaveReductions =
6233       TTI.enableAggressiveInterleaving(HasReductions);
6234   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
6235     // We assume that the cost overhead is 1 and we use the cost model
6236     // to estimate the cost of the loop and interleave until the cost of the
6237     // loop overhead is about 5% of the cost of the loop.
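    // For example (illustrative, assuming the default SmallLoopCost threshold
    // of 20): a loop whose estimated cost is 5 gets SmallIC =
    // PowerOf2Floor(20 / 5) = 4, further capped by the register-pressure IC
    // computed above.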
6238     unsigned SmallIC =
6239         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6240 
6241     // Interleave until store/load ports (estimated by max interleave count) are
6242     // saturated.
6243     unsigned NumStores = Legal->getNumStores();
6244     unsigned NumLoads = Legal->getNumLoads();
6245     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6246     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6247 
6248     // If we have a scalar reduction (vector reductions are already dealt with
6249     // by this point), we can increase the critical path length if the loop
6250     // we're interleaving is inside another loop. Limit, by default to 2, so the
6251     // critical path only gets increased by one reduction operation.
6252     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6253       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6254       SmallIC = std::min(SmallIC, F);
6255       StoresIC = std::min(StoresIC, F);
6256       LoadsIC = std::min(LoadsIC, F);
6257     }
6258 
6259     if (EnableLoadStoreRuntimeInterleave &&
6260         std::max(StoresIC, LoadsIC) > SmallIC) {
6261       LLVM_DEBUG(
6262           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6263       return std::max(StoresIC, LoadsIC);
6264     }
6265 
6266     // If there are scalar reductions and TTI has enabled aggressive
6267     // interleaving for reductions, we will interleave to expose ILP.
6268     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6269         AggressivelyInterleaveReductions) {
6270       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressively as the normal
      // IC, to cope with the rare situation where resources are too limited.
6273       return std::max(IC / 2, SmallIC);
6274     } else {
6275       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6276       return SmallIC;
6277     }
6278   }
6279 
6280   // Interleave if this is a large loop (small loops are already dealt with by
6281   // this point) that could benefit from interleaving.
6282   if (AggressivelyInterleaveReductions) {
6283     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6284     return IC;
6285   }
6286 
6287   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6288   return 1;
6289 }
6290 
6291 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6292 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6293   // This function calculates the register usage by measuring the highest number
6294   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
6296   // assign a number to each instruction. We use RPO to ensure that defs are
6297   // met before their users. We assume that each instruction that has in-loop
6298   // users starts an interval. We record every time that an in-loop value is
6299   // used, so we have a list of the first and last occurrences of each
6300   // instruction. Next, we transpose this data structure into a multi map that
6301   // holds the list of intervals that *end* at a specific location. This multi
6302   // map allows us to perform a linear search. We scan the instructions linearly
6303   // and record each time that a new interval starts, by placing it in a set.
6304   // If we find this value in the multi-map then we remove it from the set.
6305   // The max register usage is the maximum size of the set.
6306   // We also search for instructions that are defined outside the loop, but are
6307   // used inside the loop. We need this number separately from the max-interval
6308   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
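  // For example (illustrative): in a loop body of the form
  //   %a = load ...
  //   %b = load ...
  //   %c = add %a, %b
  //   store %c, ...
  // both loaded values are live when the add is reached, so the peak usage
  // recorded for their register class is 2.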
6310   LoopBlocksDFS DFS(TheLoop);
6311   DFS.perform(LI);
6312 
6313   RegisterUsage RU;
6314 
6315   // Each 'key' in the map opens a new interval. The values
6316   // of the map are the index of the 'last seen' usage of the
6317   // instruction that is the key.
6318   using IntervalMap = DenseMap<Instruction *, unsigned>;
6319 
  // Maps each index to its instruction.
6321   SmallVector<Instruction *, 64> IdxToInstr;
6322   // Marks the end of each interval.
6323   IntervalMap EndPoint;
  // Saves the list of instructions that are used in the loop.
6325   SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are defined outside
  // the loop. Only instructions are recorded; arguments and constants are
  // ignored when collecting operands below.
6328   SmallPtrSet<Value *, 8> LoopInvariants;
6329 
6330   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6331     for (Instruction &I : BB->instructionsWithoutDebug()) {
6332       IdxToInstr.push_back(&I);
6333 
6334       // Save the end location of each USE.
6335       for (Value *U : I.operands()) {
6336         auto *Instr = dyn_cast<Instruction>(U);
6337 
6338         // Ignore non-instruction values such as arguments, constants, etc.
6339         if (!Instr)
6340           continue;
6341 
6342         // If this instruction is outside the loop then record it and continue.
6343         if (!TheLoop->contains(Instr)) {
6344           LoopInvariants.insert(Instr);
6345           continue;
6346         }
6347 
6348         // Overwrite previous end points.
6349         EndPoint[Instr] = IdxToInstr.size();
6350         Ends.insert(Instr);
6351       }
6352     }
6353   }
6354 
6355   // Saves the list of intervals that end with the index in 'key'.
6356   using InstrList = SmallVector<Instruction *, 2>;
6357   DenseMap<unsigned, InstrList> TransposeEnds;
6358 
6359   // Transpose the EndPoints to a list of values that end at each index.
6360   for (auto &Interval : EndPoint)
6361     TransposeEnds[Interval.second].push_back(Interval.first);
6362 
6363   SmallPtrSet<Instruction *, 8> OpenIntervals;
6364   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6365   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6366 
6367   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6368 
6369   // A lambda that gets the register usage for the given type and VF.
6370   const auto &TTICapture = TTI;
6371   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
6372     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6373       return 0U;
6374     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6375   };
6376 
6377   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6378     Instruction *I = IdxToInstr[i];
6379 
6380     // Remove all of the instructions that end at this location.
6381     InstrList &List = TransposeEnds[i];
6382     for (Instruction *ToRemove : List)
6383       OpenIntervals.erase(ToRemove);
6384 
6385     // Ignore instructions that are never used within the loop.
6386     if (!Ends.count(I))
6387       continue;
6388 
6389     // Skip ignored values.
6390     if (ValuesToIgnore.count(I))
6391       continue;
6392 
6393     // For each VF find the maximum usage of registers.
6394     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6395       // Count the number of live intervals.
6396       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6397 
6398       if (VFs[j].isScalar()) {
6399         for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
6401           if (RegUsage.find(ClassID) == RegUsage.end())
6402             RegUsage[ClassID] = 1;
6403           else
6404             RegUsage[ClassID] += 1;
6405         }
6406       } else {
6407         collectUniformsAndScalars(VFs[j]);
6408         for (auto Inst : OpenIntervals) {
6409           // Skip ignored values for VF > 1.
6410           if (VecValuesToIgnore.count(Inst))
6411             continue;
6412           if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
6414             if (RegUsage.find(ClassID) == RegUsage.end())
6415               RegUsage[ClassID] = 1;
6416             else
6417               RegUsage[ClassID] += 1;
6418           } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
6420             if (RegUsage.find(ClassID) == RegUsage.end())
6421               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6422             else
6423               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6424           }
6425         }
6426       }
6427 
6428       for (auto& pair : RegUsage) {
6429         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] =
              std::max(MaxUsages[j][pair.first], pair.second);
6431         else
6432           MaxUsages[j][pair.first] = pair.second;
6433       }
6434     }
6435 
6436     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6437                       << OpenIntervals.size() << '\n');
6438 
6439     // Add the current instruction to the list of open intervals.
6440     OpenIntervals.insert(I);
6441   }
6442 
6443   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6444     SmallMapVector<unsigned, unsigned, 4> Invariant;
6445 
6446     for (auto Inst : LoopInvariants) {
6447       unsigned Usage =
6448           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6449       unsigned ClassID =
6450           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6451       if (Invariant.find(ClassID) == Invariant.end())
6452         Invariant[ClassID] = Usage;
6453       else
6454         Invariant[ClassID] += Usage;
6455     }
6456 
6457     LLVM_DEBUG({
6458       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6459       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6460              << " item\n";
6461       for (const auto &pair : MaxUsages[i]) {
6462         dbgs() << "LV(REG): RegisterClass: "
6463                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6464                << " registers\n";
6465       }
6466       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6467              << " item\n";
6468       for (const auto &pair : Invariant) {
6469         dbgs() << "LV(REG): RegisterClass: "
6470                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6471                << " registers\n";
6472       }
6473     });
6474 
6475     RU.LoopInvariantRegs = Invariant;
6476     RU.MaxLocalUsers = MaxUsages[i];
6477     RUs[i] = RU;
6478   }
6479 
6480   return RUs;
6481 }
6482 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
6484   // TODO: Cost model for emulated masked load/store is completely
6485   // broken. This hack guides the cost model to use an artificially
6486   // high enough value to practically disable vectorization with such
6487   // operations, except where previously deployed legality hack allowed
6488   // using very low cost values. This is to avoid regressions coming simply
6489   // from moving "masked load/store" check from legality to cost model.
  // Emulation of masked loads/gathers was previously never allowed.
  // Emulation of a limited number of masked stores/scatters was allowed.
6492   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
6493   return isa<LoadInst>(I) ||
6494          (isa<StoreInst>(I) &&
6495           NumPredStores > NumberOfStoresToPredicate);
6496 }
6497 
6498 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6499   // If we aren't vectorizing the loop, or if we've already collected the
6500   // instructions to scalarize, there's nothing to do. Collection may already
6501   // have occurred if we have a user-selected VF and are now computing the
6502   // expected cost for interleaving.
6503   if (VF.isScalar() || VF.isZero() ||
6504       InstsToScalarize.find(VF) != InstsToScalarize.end())
6505     return;
6506 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6508   // not profitable to scalarize any instructions, the presence of VF in the
6509   // map will indicate that we've analyzed it already.
6510   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6511 
6512   // Find all the instructions that are scalar with predication in the loop and
6513   // determine if it would be better to not if-convert the blocks they are in.
6514   // If so, we also record the instructions to scalarize.
6515   for (BasicBlock *BB : TheLoop->blocks()) {
6516     if (!blockNeedsPredication(BB))
6517       continue;
6518     for (Instruction &I : *BB)
6519       if (isScalarWithPredication(&I)) {
6520         ScalarCostsTy ScalarCosts;
6521         // Do not apply discount logic if hacked cost is needed
6522         // for emulated masked memrefs.
6523         if (!useEmulatedMaskMemRefHack(&I) &&
6524             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6525           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6526         // Remember that BB will remain after vectorization.
6527         PredicatedBBsAfterVectorization.insert(BB);
6528       }
6529   }
6530 }
6531 
6532 int LoopVectorizationCostModel::computePredInstDiscount(
6533     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6534   assert(!isUniformAfterVectorization(PredInst, VF) &&
6535          "Instruction marked uniform-after-vectorization will be predicated");
6536 
6537   // Initialize the discount to zero, meaning that the scalar version and the
6538   // vector version cost the same.
6539   InstructionCost Discount = 0;
6540 
6541   // Holds instructions to analyze. The instructions we visit are mapped in
6542   // ScalarCosts. Those instructions are the ones that would be scalarized if
6543   // we find that the scalar version costs less.
6544   SmallVector<Instruction *, 8> Worklist;
6545 
6546   // Returns true if the given instruction can be scalarized.
6547   auto canBeScalarized = [&](Instruction *I) -> bool {
6548     // We only attempt to scalarize instructions forming a single-use chain
6549     // from the original predicated block that would otherwise be vectorized.
6550     // Although not strictly necessary, we give up on instructions we know will
6551     // already be scalar to avoid traversing chains that are unlikely to be
6552     // beneficial.
6553     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6554         isScalarAfterVectorization(I, VF))
6555       return false;
6556 
6557     // If the instruction is scalar with predication, it will be analyzed
6558     // separately. We ignore it within the context of PredInst.
6559     if (isScalarWithPredication(I))
6560       return false;
6561 
6562     // If any of the instruction's operands are uniform after vectorization,
6563     // the instruction cannot be scalarized. This prevents, for example, a
6564     // masked load from being scalarized.
6565     //
6566     // We assume we will only emit a value for lane zero of an instruction
6567     // marked uniform after vectorization, rather than VF identical values.
6568     // Thus, if we scalarize an instruction that uses a uniform, we would
6569     // create uses of values corresponding to the lanes we aren't emitting code
6570     // for. This behavior can be changed by allowing getScalarValue to clone
6571     // the lane zero values for uniforms rather than asserting.
6572     for (Use &U : I->operands())
6573       if (auto *J = dyn_cast<Instruction>(U.get()))
6574         if (isUniformAfterVectorization(J, VF))
6575           return false;
6576 
6577     // Otherwise, we can scalarize the instruction.
6578     return true;
6579   };
6580 
6581   // Compute the expected cost discount from scalarizing the entire expression
6582   // feeding the predicated instruction. We currently only consider expressions
6583   // that are single-use instruction chains.
6584   Worklist.push_back(PredInst);
6585   while (!Worklist.empty()) {
6586     Instruction *I = Worklist.pop_back_val();
6587 
6588     // If we've already analyzed the instruction, there's nothing to do.
6589     if (ScalarCosts.find(I) != ScalarCosts.end())
6590       continue;
6591 
6592     // Compute the cost of the vector instruction. Note that this cost already
6593     // includes the scalarization overhead of the predicated instruction.
6594     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6595 
6596     // Compute the cost of the scalarized instruction. This cost is the cost of
6597     // the instruction as if it wasn't if-converted and instead remained in the
6598     // predicated block. We will scale this cost by block probability after
6599     // computing the scalarization overhead.
6600     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6601     InstructionCost ScalarCost =
6602         VF.getKnownMinValue() *
6603         getInstructionCost(I, ElementCount::getFixed(1)).first;
6604 
6605     // Compute the scalarization overhead of needed insertelement instructions
6606     // and phi nodes.
6607     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6608       ScalarCost += TTI.getScalarizationOverhead(
6609           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6610           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6611       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6612       ScalarCost +=
6613           VF.getKnownMinValue() *
6614           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6615     }
6616 
6617     // Compute the scalarization overhead of needed extractelement
6618     // instructions. For each of the instruction's operands, if the operand can
6619     // be scalarized, add it to the worklist; otherwise, account for the
6620     // overhead.
6621     for (Use &U : I->operands())
6622       if (auto *J = dyn_cast<Instruction>(U.get())) {
6623         assert(VectorType::isValidElementType(J->getType()) &&
6624                "Instruction has non-scalar type");
6625         if (canBeScalarized(J))
6626           Worklist.push_back(J);
6627         else if (needsExtract(J, VF)) {
6628           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6629           ScalarCost += TTI.getScalarizationOverhead(
6630               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6631               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6632         }
6633       }
6634 
6635     // Scale the total scalar cost by block probability.
6636     ScalarCost /= getReciprocalPredBlockProb();
6637 
6638     // Compute the discount. A non-negative discount means the vector version
6639     // of the instruction costs more, and scalarizing would be beneficial.
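    // For example (illustrative): if the predicated vector form of the chain
    // costs 10 and the probability-scaled scalar form costs 6, the discount
    // grows by 4 and the caller will consider scalarizing the chain.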
6640     Discount += VectorCost - ScalarCost;
6641     ScalarCosts[I] = ScalarCost;
6642   }
6643 
6644   return *Discount.getValue();
6645 }
6646 
6647 LoopVectorizationCostModel::VectorizationCostTy
6648 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6649   VectorizationCostTy Cost;
6650 
6651   // For each block.
6652   for (BasicBlock *BB : TheLoop->blocks()) {
6653     VectorizationCostTy BlockCost;
6654 
6655     // For each instruction in the old loop.
6656     for (Instruction &I : BB->instructionsWithoutDebug()) {
6657       // Skip ignored values.
6658       if (ValuesToIgnore.count(&I) ||
6659           (VF.isVector() && VecValuesToIgnore.count(&I)))
6660         continue;
6661 
6662       VectorizationCostTy C = getInstructionCost(&I, VF);
6663 
6664       // Check if we should override the cost.
6665       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6666         C.first = InstructionCost(ForceTargetInstructionCost);
6667 
6668       BlockCost.first += C.first;
6669       BlockCost.second |= C.second;
6670       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6671                         << " for VF " << VF << " For instruction: " << I
6672                         << '\n');
6673     }
6674 
6675     // If we are vectorizing a predicated block, it will have been
6676     // if-converted. This means that the block's instructions (aside from
6677     // stores and instructions that may divide by zero) will now be
6678     // unconditionally executed. For the scalar case, we may not always execute
6679     // the predicated block, if it is an if-else block. Thus, scale the block's
6680     // cost by the probability of executing it. blockNeedsPredication from
6681     // Legal is used so as to not include all blocks in tail folded loops.
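    // For example (illustrative): assuming the reciprocal block probability
    // is 2 (i.e. the block is assumed to execute on roughly every other
    // iteration), a predicated block whose instructions sum to a cost of 8
    // contributes 4 to the scalar loop cost.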
6682     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6683       BlockCost.first /= getReciprocalPredBlockProb();
6684 
6685     Cost.first += BlockCost.first;
6686     Cost.second |= BlockCost.second;
6687   }
6688 
6689   return Cost;
6690 }
6691 
6692 /// Gets Address Access SCEV after verifying that the access pattern
6693 /// is loop invariant except the induction variable dependence.
6694 ///
6695 /// This SCEV can be sent to the Target in order to estimate the address
6696 /// calculation cost.
6697 static const SCEV *getAddressAccessSCEV(
6698               Value *Ptr,
6699               LoopVectorizationLegality *Legal,
6700               PredicatedScalarEvolution &PSE,
6701               const Loop *TheLoop) {
6702 
6703   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6704   if (!Gep)
6705     return nullptr;
6706 
6707   // We are looking for a gep with all loop invariant indices except for one
6708   // which should be an induction variable.
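  // For example (illustrative): a GEP such as
  //   getelementptr inbounds float, float* %A, i64 %iv
  // qualifies because its only non-invariant index, %iv, is an induction
  // variable, whereas a GEP whose index is itself loaded inside the loop
  // does not.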
6709   auto SE = PSE.getSE();
6710   unsigned NumOperands = Gep->getNumOperands();
6711   for (unsigned i = 1; i < NumOperands; ++i) {
6712     Value *Opd = Gep->getOperand(i);
6713     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6714         !Legal->isInductionVariable(Opd))
6715       return nullptr;
6716   }
6717 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6719   return PSE.getSCEV(Ptr);
6720 }
6721 
6722 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6723   return Legal->hasStride(I->getOperand(0)) ||
6724          Legal->hasStride(I->getOperand(1));
6725 }
6726 
6727 InstructionCost
6728 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6729                                                         ElementCount VF) {
6730   assert(VF.isVector() &&
6731          "Scalarization cost of instruction implies vectorization.");
6732   assert(!VF.isScalable() && "scalable vectors not yet supported.");
6733   Type *ValTy = getMemInstValueType(I);
6734   auto SE = PSE.getSE();
6735 
6736   unsigned AS = getLoadStoreAddressSpace(I);
6737   Value *Ptr = getLoadStorePointerOperand(I);
6738   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6739 
6740   // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6742   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6743 
6744   // Get the cost of the scalar memory instruction and address computation.
6745   InstructionCost Cost =
6746       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6747 
6748   // Don't pass *I here, since it is scalar but will actually be part of a
6749   // vectorized loop where the user of it is a vectorized instruction.
6750   const Align Alignment = getLoadStoreAlignment(I);
6751   Cost += VF.getKnownMinValue() *
6752           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6753                               AS, TTI::TCK_RecipThroughput);
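  // For example (illustrative): with VF = 4 and unit costs for both the
  // address computation and the scalar memory op, the estimate so far is
  // 4 * 1 + 4 * 1 = 8, before the insert/extract overhead added below.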
6754 
6755   // Get the overhead of the extractelement and insertelement instructions
6756   // we might create due to scalarization.
6757   Cost += getScalarizationOverhead(I, VF);
6758 
6759   // If we have a predicated store, it may not be executed for each vector
6760   // lane. Scale the cost by the probability of executing the predicated
6761   // block.
6762   if (isPredicatedInst(I)) {
6763     Cost /= getReciprocalPredBlockProb();
6764 
6765     if (useEmulatedMaskMemRefHack(I))
6766       // Artificially setting to a high enough value to practically disable
6767       // vectorization with such operations.
6768       Cost = 3000000;
6769   }
6770 
6771   return Cost;
6772 }
6773 
6774 InstructionCost
6775 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6776                                                     ElementCount VF) {
6777   Type *ValTy = getMemInstValueType(I);
6778   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6779   Value *Ptr = getLoadStorePointerOperand(I);
6780   unsigned AS = getLoadStoreAddressSpace(I);
6781   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
6782   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6783 
6784   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6785          "Stride should be 1 or -1 for consecutive memory access");
6786   const Align Alignment = getLoadStoreAlignment(I);
6787   InstructionCost Cost = 0;
6788   if (Legal->isMaskRequired(I))
6789     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6790                                       CostKind);
6791   else
6792     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6793                                 CostKind, I);
6794 
6795   bool Reverse = ConsecutiveStride < 0;
6796   if (Reverse)
6797     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6798   return Cost;
6799 }
6800 
6801 InstructionCost
6802 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6803                                                 ElementCount VF) {
6804   assert(Legal->isUniformMemOp(*I));
6805 
6806   Type *ValTy = getMemInstValueType(I);
6807   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6808   const Align Alignment = getLoadStoreAlignment(I);
6809   unsigned AS = getLoadStoreAddressSpace(I);
6810   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
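  // For example (illustrative): a uniform load becomes one scalar load plus
  // a broadcast shuffle, so with unit costs the estimate is 1 (address) +
  // 1 (load) + 1 (broadcast) = 3.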
6811   if (isa<LoadInst>(I)) {
6812     return TTI.getAddressComputationCost(ValTy) +
6813            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6814                                CostKind) +
6815            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6816   }
6817   StoreInst *SI = cast<StoreInst>(I);
6818 
6819   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6820   return TTI.getAddressComputationCost(ValTy) +
6821          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6822                              CostKind) +
6823          (isLoopInvariantStoreValue
6824               ? 0
6825               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6826                                        VF.getKnownMinValue() - 1));
6827 }
6828 
6829 InstructionCost
6830 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6831                                                  ElementCount VF) {
6832   Type *ValTy = getMemInstValueType(I);
6833   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6834   const Align Alignment = getLoadStoreAlignment(I);
6835   const Value *Ptr = getLoadStorePointerOperand(I);
6836 
6837   return TTI.getAddressComputationCost(VectorTy) +
6838          TTI.getGatherScatterOpCost(
6839              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6840              TargetTransformInfo::TCK_RecipThroughput, I);
6841 }
6842 
6843 InstructionCost
6844 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6845                                                    ElementCount VF) {
6846   // TODO: Once we have support for interleaving with scalable vectors
6847   // we can calculate the cost properly here.
6848   if (VF.isScalable())
6849     return InstructionCost::getInvalid();
6850 
6851   Type *ValTy = getMemInstValueType(I);
6852   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6853   unsigned AS = getLoadStoreAddressSpace(I);
6854 
6855   auto Group = getInterleavedAccessGroup(I);
6856   assert(Group && "Fail to get an interleaved access group.");
6857 
6858   unsigned InterleaveFactor = Group->getFactor();
6859   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
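  // For example (illustrative): a factor-2 group of i32 loads at VF = 4 is
  // costed as a single wide <8 x i32> access plus the shuffles needed to
  // de-interleave its members, all of which TTI accounts for below.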
6860 
6861   // Holds the indices of existing members in an interleaved load group.
6862   // An interleaved store group doesn't need this as it doesn't allow gaps.
6863   SmallVector<unsigned, 4> Indices;
6864   if (isa<LoadInst>(I)) {
6865     for (unsigned i = 0; i < InterleaveFactor; i++)
6866       if (Group->getMember(i))
6867         Indices.push_back(i);
6868   }
6869 
6870   // Calculate the cost of the whole interleaved group.
6871   bool UseMaskForGaps =
6872       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
6873   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6874       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6875       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6876 
6877   if (Group->isReverse()) {
6878     // TODO: Add support for reversed masked interleaved access.
6879     assert(!Legal->isMaskRequired(I) &&
6880            "Reverse masked interleaved access not supported.");
6881     Cost += Group->getNumMembers() *
6882             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
6883   }
6884   return Cost;
6885 }
6886 
6887 InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
6888     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  // Early exit for loops with no in-loop reductions.
6890   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6891     return InstructionCost::getInvalid();
6892   auto *VectorTy = cast<VectorType>(Ty);
6893 
  // We are looking for one of the following patterns, choosing the minimal
  // acceptable cost:
6895   //  reduce(mul(ext(A), ext(B))) or
6896   //  reduce(mul(A, B)) or
6897   //  reduce(ext(A)) or
6898   //  reduce(A).
6899   // The basic idea is that we walk down the tree to do that, finding the root
6900   // reduction instruction in InLoopReductionImmediateChains. From there we find
6901   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6902   // of the components. If the reduction cost is lower then we return it for the
6903   // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not, we return an invalid cost specifying that the original cost
  // method should be used.
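  // For example (illustrative IR shape):
  //   %a.ext = sext <8 x i16> %a to <8 x i32>
  //   %b.ext = sext <8 x i16> %b to <8 x i32>
  //   %mul = mul <8 x i32> %a.ext, %b.ext
  //   ... in-loop add reduction of %mul ...
  // may be cheaper as a single extended multiply-accumulate reduction (a
  // dot-product style operation) than as the sum of the individual ext, mul
  // and reduction costs.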
6906   Instruction *RetI = I;
6907   if ((RetI->getOpcode() == Instruction::SExt ||
6908        RetI->getOpcode() == Instruction::ZExt)) {
6909     if (!RetI->hasOneUser())
6910       return InstructionCost::getInvalid();
6911     RetI = RetI->user_back();
6912   }
6913   if (RetI->getOpcode() == Instruction::Mul &&
6914       RetI->user_back()->getOpcode() == Instruction::Add) {
6915     if (!RetI->hasOneUser())
6916       return InstructionCost::getInvalid();
6917     RetI = RetI->user_back();
6918   }
6919 
6920   // Test if the found instruction is a reduction, and if not return an invalid
6921   // cost specifying the parent to use the original cost modelling.
6922   if (!InLoopReductionImmediateChains.count(RetI))
6923     return InstructionCost::getInvalid();
6924 
6925   // Find the reduction this chain is a part of and calculate the basic cost of
6926   // the reduction on its own.
6927   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6928   Instruction *ReductionPhi = LastChain;
6929   while (!isa<PHINode>(ReductionPhi))
6930     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6931 
6932   RecurrenceDescriptor RdxDesc =
6933       Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
6934   unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(),
6935                                                      VectorTy, false, CostKind);
6936 
6937   // Get the operand that was not the reduction chain and match it to one of the
6938   // patterns, returning the better cost if it is found.
6939   Instruction *RedOp = RetI->getOperand(1) == LastChain
6940                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6941                            : dyn_cast<Instruction>(RetI->getOperand(1));
6942 
6943   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6944 
6945   if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) &&
6946       !TheLoop->isLoopInvariant(RedOp)) {
6947     bool IsUnsigned = isa<ZExtInst>(RedOp);
6948     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6949     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6950         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6951         CostKind);
6952 
6953     unsigned ExtCost =
6954         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6955                              TTI::CastContextHint::None, CostKind, RedOp);
6956     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6957       return I == RetI ? *RedCost.getValue() : 0;
6958   } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) {
6959     Instruction *Mul = RedOp;
6960     Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0));
6961     Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1));
6962     if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) &&
6963         Op0->getOpcode() == Op1->getOpcode() &&
6964         Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6965         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6966       bool IsUnsigned = isa<ZExtInst>(Op0);
6967       auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6968       // reduce(mul(ext, ext))
6969       unsigned ExtCost =
6970           TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
6971                                TTI::CastContextHint::None, CostKind, Op0);
6972       InstructionCost MulCost =
6973           TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
6974 
6975       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6976           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6977           CostKind);
6978 
6979       if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
6980         return I == RetI ? *RedCost.getValue() : 0;
6981     } else {
6982       InstructionCost MulCost =
6983           TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
6984 
6985       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6986           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
6987           CostKind);
6988 
6989       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6990         return I == RetI ? *RedCost.getValue() : 0;
6991     }
6992   }
6993 
6994   return I == RetI ? BaseCost : InstructionCost::getInvalid();
6995 }
6996 
6997 InstructionCost
6998 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6999                                                      ElementCount VF) {
7000   // Calculate scalar cost only. Vectorization cost should be ready at this
7001   // moment.
7002   if (VF.isScalar()) {
7003     Type *ValTy = getMemInstValueType(I);
7004     const Align Alignment = getLoadStoreAlignment(I);
7005     unsigned AS = getLoadStoreAddressSpace(I);
7006 
7007     return TTI.getAddressComputationCost(ValTy) +
7008            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
7009                                TTI::TCK_RecipThroughput, I);
7010   }
7011   return getWideningCost(I, VF);
7012 }
7013 
7014 LoopVectorizationCostModel::VectorizationCostTy
7015 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
7016                                                ElementCount VF) {
7017   // If we know that this instruction will remain uniform, check the cost of
7018   // the scalar version.
7019   if (isUniformAfterVectorization(I, VF))
7020     VF = ElementCount::getFixed(1);
7021 
7022   if (VF.isVector() && isProfitableToScalarize(I, VF))
7023     return VectorizationCostTy(InstsToScalarize[VF][I], false);
7024 
7025   // Forced scalars do not have any scalarization overhead.
7026   auto ForcedScalar = ForcedScalars.find(VF);
7027   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
7028     auto InstSet = ForcedScalar->second;
7029     if (InstSet.count(I))
7030       return VectorizationCostTy(
7031           (getInstructionCost(I, ElementCount::getFixed(1)).first *
7032            VF.getKnownMinValue()),
7033           false);
7034   }
7035 
7036   Type *VectorTy;
7037   InstructionCost C = getInstructionCost(I, VF, VectorTy);
7038 
7039   bool TypeNotScalarized =
7040       VF.isVector() && VectorTy->isVectorTy() &&
7041       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
7042   return VectorizationCostTy(C, TypeNotScalarized);
7043 }
7044 
7045 InstructionCost
7046 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
7047                                                      ElementCount VF) {
7048 
7049   if (VF.isScalable())
7050     return InstructionCost::getInvalid();
7051 
7052   if (VF.isScalar())
7053     return 0;
7054 
7055   InstructionCost Cost = 0;
7056   Type *RetTy = ToVectorTy(I->getType(), VF);
7057   if (!RetTy->isVoidTy() &&
7058       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
7059     Cost += TTI.getScalarizationOverhead(
7060         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
7061         true, false);
7062 
7063   // Some targets keep addresses scalar.
7064   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
7065     return Cost;
7066 
7067   // Some targets support efficient element stores.
7068   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
7069     return Cost;
7070 
7071   // Collect operands to consider.
7072   CallInst *CI = dyn_cast<CallInst>(I);
7073   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
7074 
7075   // Skip operands that do not require extraction/scalarization and do not incur
7076   // any overhead.
7077   SmallVector<Type *> Tys;
7078   for (auto *V : filterExtractingOperands(Ops, VF))
7079     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
7080   return Cost + TTI.getOperandsScalarizationOverhead(
7081                     filterExtractingOperands(Ops, VF), Tys);
7082 }
7083 
7084 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
7085   if (VF.isScalar())
7086     return;
7087   NumPredStores = 0;
7088   for (BasicBlock *BB : TheLoop->blocks()) {
7089     // For each instruction in the old loop.
7090     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
7092       if (!Ptr)
7093         continue;
7094 
7095       // TODO: We should generate better code and update the cost model for
7096       // predicated uniform stores. Today they are treated as any other
7097       // predicated store (see added test cases in
7098       // invariant-store-vectorization.ll).
7099       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
7100         NumPredStores++;
7101 
7102       if (Legal->isUniformMemOp(I)) {
7103         // TODO: Avoid replicating loads and stores instead of
7104         // relying on instcombine to remove them.
7105         // Load: Scalar load + broadcast
7106         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
7107         InstructionCost Cost = getUniformMemOpCost(&I, VF);
7108         setWideningDecision(&I, VF, CM_Scalarize, Cost);
7109         continue;
7110       }
7111 
7112       // We assume that widening is the best solution when possible.
7113       if (memoryInstructionCanBeWidened(&I, VF)) {
7114         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
7115         int ConsecutiveStride =
            Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
7117         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7118                "Expected consecutive stride.");
7119         InstWidening Decision =
7120             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7121         setWideningDecision(&I, VF, Decision, Cost);
7122         continue;
7123       }
7124 
7125       // Choose between Interleaving, Gather/Scatter or Scalarization.
7126       InstructionCost InterleaveCost = InstructionCost::getInvalid();
7127       unsigned NumAccesses = 1;
7128       if (isAccessInterleaved(&I)) {
7129         auto Group = getInterleavedAccessGroup(&I);
7130         assert(Group && "Fail to get an interleaved access group.");
7131 
7132         // Make one decision for the whole group.
7133         if (getWideningDecision(&I, VF) != CM_Unknown)
7134           continue;
7135 
7136         NumAccesses = Group->getNumMembers();
7137         if (interleavedAccessCanBeWidened(&I, VF))
7138           InterleaveCost = getInterleaveGroupCost(&I, VF);
7139       }
7140 
7141       InstructionCost GatherScatterCost =
7142           isLegalGatherOrScatter(&I)
7143               ? getGatherScatterCost(&I, VF) * NumAccesses
7144               : InstructionCost::getInvalid();
7145 
7146       InstructionCost ScalarizationCost =
7147           !VF.isScalable() ? getMemInstScalarizationCost(&I, VF) * NumAccesses
7148                            : InstructionCost::getInvalid();
7149 
7150       // Choose better solution for the current VF,
7151       // write down this decision and use it during vectorization.
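      // For example (illustrative): if InterleaveCost = 8, GatherScatterCost
      // = 12 and ScalarizationCost = 20, the access is widened via
      // interleaving and a cost of 8 is recorded for the group.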
7152       InstructionCost Cost;
7153       InstWidening Decision;
7154       if (InterleaveCost <= GatherScatterCost &&
7155           InterleaveCost < ScalarizationCost) {
7156         Decision = CM_Interleave;
7157         Cost = InterleaveCost;
7158       } else if (GatherScatterCost < ScalarizationCost) {
7159         Decision = CM_GatherScatter;
7160         Cost = GatherScatterCost;
7161       } else {
7162         assert(!VF.isScalable() &&
7163                "We cannot yet scalarise for scalable vectors");
7164         Decision = CM_Scalarize;
7165         Cost = ScalarizationCost;
7166       }
      // If the instruction belongs to an interleave group, the whole group
7168       // receives the same decision. The whole group receives the cost, but
7169       // the cost will actually be assigned to one instruction.
7170       if (auto Group = getInterleavedAccessGroup(&I))
7171         setWideningDecision(Group, VF, Decision, Cost);
7172       else
7173         setWideningDecision(&I, VF, Decision, Cost);
7174     }
7175   }
7176 
7177   // Make sure that any load of address and any other address computation
7178   // remains scalar unless there is gather/scatter support. This avoids
7179   // inevitable extracts into address registers, and also has the benefit of
7180   // activating LSR more, since that pass can't optimize vectorized
7181   // addresses.
7182   if (TTI.prefersVectorizedAddressing())
7183     return;
7184 
7185   // Start with all scalar pointer uses.
7186   SmallPtrSet<Instruction *, 8> AddrDefs;
7187   for (BasicBlock *BB : TheLoop->blocks())
7188     for (Instruction &I : *BB) {
7189       Instruction *PtrDef =
7190         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7191       if (PtrDef && TheLoop->contains(PtrDef) &&
7192           getWideningDecision(&I, VF) != CM_GatherScatter)
7193         AddrDefs.insert(PtrDef);
7194     }
7195 
7196   // Add all instructions used to generate the addresses.
7197   SmallVector<Instruction *, 4> Worklist;
7198   append_range(Worklist, AddrDefs);
7199   while (!Worklist.empty()) {
7200     Instruction *I = Worklist.pop_back_val();
7201     for (auto &Op : I->operands())
7202       if (auto *InstOp = dyn_cast<Instruction>(Op))
7203         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7204             AddrDefs.insert(InstOp).second)
7205           Worklist.push_back(InstOp);
7206   }
7207 
7208   for (auto *I : AddrDefs) {
7209     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves the task of finding out
7212       // if the loaded register is involved in an address computation, it is
7213       // instead changed here when we know this is the case.
7214       InstWidening Decision = getWideningDecision(I, VF);
7215       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7216         // Scalarize a widened load of address.
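        // The cost is the scalar memory instruction cost paid once per lane.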
7217         setWideningDecision(
7218             I, VF, CM_Scalarize,
7219             (VF.getKnownMinValue() *
7220              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7221       else if (auto Group = getInterleavedAccessGroup(I)) {
7222         // Scalarize an interleave group of address loads.
7223         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7224           if (Instruction *Member = Group->getMember(I))
7225             setWideningDecision(
7226                 Member, VF, CM_Scalarize,
7227                 (VF.getKnownMinValue() *
7228                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7229         }
7230       }
7231     } else
7232       // Make sure I gets scalarized and a cost estimate without
7233       // scalarization overhead.
7234       ForcedScalars[VF].insert(I);
7235   }
7236 }
7237 
7238 InstructionCost
7239 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7240                                                Type *&VectorTy) {
7241   Type *RetTy = I->getType();
7242   if (canTruncateToMinimalBitwidth(I, VF))
7243     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7244   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
7245   auto SE = PSE.getSE();
7246   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7247 
7248   // TODO: We need to estimate the cost of intrinsic calls.
7249   switch (I->getOpcode()) {
7250   case Instruction::GetElementPtr:
7251     // We mark this instruction as zero-cost because the cost of GEPs in
7252     // vectorized code depends on whether the corresponding memory instruction
7253     // is scalarized or not. Therefore, we handle GEPs with the memory
7254     // instruction cost.
7255     return 0;
7256   case Instruction::Br: {
7257     // In cases of scalarized and predicated instructions, there will be VF
7258     // predicated blocks in the vectorized loop. Each branch around these
7259     // blocks also requires an extract of its vector compare i1 element.
7260     bool ScalarPredicatedBB = false;
7261     BranchInst *BI = cast<BranchInst>(I);
7262     if (VF.isVector() && BI->isConditional() &&
7263         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7264          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7265       ScalarPredicatedBB = true;
7266 
7267     if (ScalarPredicatedBB) {
7268       // Return cost for branches around scalarized and predicated blocks.
7269       assert(!VF.isScalable() && "scalable vectors not yet supported.");
7270       auto *Vec_i1Ty =
7271           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7272       return (TTI.getScalarizationOverhead(
7273                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
7274                   false, true) +
7275               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
7276                VF.getKnownMinValue()));
7277     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7278       // The back-edge branch will remain, as will all scalar branches.
7279       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7280     else
7281       // This branch will be eliminated by if-conversion.
7282       return 0;
7283     // Note: We currently assume zero cost for an unconditional branch inside
7284     // a predicated block since it will become a fall-through, although we
7285     // may decide in the future to call TTI for all branches.
7286   }
7287   case Instruction::PHI: {
7288     auto *Phi = cast<PHINode>(I);
7289 
7290     // First-order recurrences are replaced by vector shuffles inside the loop.
7291     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7292     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7293       return TTI.getShuffleCost(
7294           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7295           VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7296 
7297     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7298     // converted into select instructions. We require N - 1 selects per phi
7299     // node, where N is the number of incoming values.
7300     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7301       return (Phi->getNumIncomingValues() - 1) *
7302              TTI.getCmpSelInstrCost(
7303                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7304                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7305                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7306 
7307     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7308   }
7309   case Instruction::UDiv:
7310   case Instruction::SDiv:
7311   case Instruction::URem:
7312   case Instruction::SRem:
7313     // If we have a predicated instruction, it may not be executed for each
7314     // vector lane. Get the scalarization cost and scale this amount by the
7315     // probability of executing the predicated block. If the instruction is not
7316     // predicated, we fall through to the next case.
7317     if (VF.isVector() && isScalarWithPredication(I)) {
7318       InstructionCost Cost = 0;
7319 
7320       // These instructions have a non-void type, so account for the phi nodes
7321       // that we will create. This cost is likely to be zero. The phi node
7322       // cost, if any, should be scaled by the block probability because it
7323       // models a copy at the end of each predicated block.
7324       Cost += VF.getKnownMinValue() *
7325               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7326 
7327       // The cost of the non-predicated instruction.
7328       Cost += VF.getKnownMinValue() *
7329               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7330 
7331       // The cost of insertelement and extractelement instructions needed for
7332       // scalarization.
7333       Cost += getScalarizationOverhead(I, VF);
7334 
7335       // Scale the cost by the probability of executing the predicated blocks.
7336       // This assumes the predicated block for each vector lane is equally
7337       // likely.
7338       return Cost / getReciprocalPredBlockProb();
7339     }
7340     LLVM_FALLTHROUGH;
7341   case Instruction::Add:
7342   case Instruction::FAdd:
7343   case Instruction::Sub:
7344   case Instruction::FSub:
7345   case Instruction::Mul:
7346   case Instruction::FMul:
7347   case Instruction::FDiv:
7348   case Instruction::FRem:
7349   case Instruction::Shl:
7350   case Instruction::LShr:
7351   case Instruction::AShr:
7352   case Instruction::And:
7353   case Instruction::Or:
7354   case Instruction::Xor: {
7355     // Since we will replace the stride by 1 the multiplication should go away.
7356     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7357       return 0;
7358 
7359     // Detect reduction patterns
7360     InstructionCost RedCost;
7361     if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7362             .isValid())
7363       return RedCost;
7364 
7365     // Certain instructions can be cheaper to vectorize if they have a constant
7366     // second vector operand. One example of this are shifts on x86.
7367     Value *Op2 = I->getOperand(1);
7368     TargetTransformInfo::OperandValueProperties Op2VP;
7369     TargetTransformInfo::OperandValueKind Op2VK =
7370         TTI.getOperandInfo(Op2, Op2VP);
7371     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7372       Op2VK = TargetTransformInfo::OK_UniformValue;
7373 
7374     SmallVector<const Value *, 4> Operands(I->operand_values());
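    // If the instruction remains scalar after vectorization, VF scalar copies
    // are emitted, so the scalar cost is paid once per lane.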
7375     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7376     return N * TTI.getArithmeticInstrCost(
7377                    I->getOpcode(), VectorTy, CostKind,
7378                    TargetTransformInfo::OK_AnyValue,
7379                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7380   }
7381   case Instruction::FNeg: {
7382     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
7383     unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
7384     return N * TTI.getArithmeticInstrCost(
7385                    I->getOpcode(), VectorTy, CostKind,
7386                    TargetTransformInfo::OK_AnyValue,
7387                    TargetTransformInfo::OK_AnyValue,
7388                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
7389                    I->getOperand(0), I);
7390   }
7391   case Instruction::Select: {
7392     SelectInst *SI = cast<SelectInst>(I);
7393     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7394     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7395     Type *CondTy = SI->getCondition()->getType();
7396     if (!ScalarCond)
7397       CondTy = VectorType::get(CondTy, VF);
7398     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7399                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7400   }
7401   case Instruction::ICmp:
7402   case Instruction::FCmp: {
7403     Type *ValTy = I->getOperand(0)->getType();
7404     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7405     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7406       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7407     VectorTy = ToVectorTy(ValTy, VF);
7408     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7409                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7410   }
7411   case Instruction::Store:
7412   case Instruction::Load: {
7413     ElementCount Width = VF;
7414     if (Width.isVector()) {
7415       InstWidening Decision = getWideningDecision(I, Width);
7416       assert(Decision != CM_Unknown &&
7417              "CM decision should be taken at this point");
7418       if (Decision == CM_Scalarize)
7419         Width = ElementCount::getFixed(1);
7420     }
7421     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
7422     return getMemoryInstructionCost(I, VF);
7423   }
7424   case Instruction::ZExt:
7425   case Instruction::SExt:
7426   case Instruction::FPToUI:
7427   case Instruction::FPToSI:
7428   case Instruction::FPExt:
7429   case Instruction::PtrToInt:
7430   case Instruction::IntToPtr:
7431   case Instruction::SIToFP:
7432   case Instruction::UIToFP:
7433   case Instruction::Trunc:
7434   case Instruction::FPTrunc:
7435   case Instruction::BitCast: {
7436     // Computes the CastContextHint from a Load/Store instruction.
7437     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7438       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7439              "Expected a load or a store!");
7440 
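      // With a scalar VF, or for accesses outside the loop, there is no
      // widening decision; use the normal context.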
7441       if (VF.isScalar() || !TheLoop->contains(I))
7442         return TTI::CastContextHint::Normal;
7443 
7444       switch (getWideningDecision(I, VF)) {
7445       case LoopVectorizationCostModel::CM_GatherScatter:
7446         return TTI::CastContextHint::GatherScatter;
7447       case LoopVectorizationCostModel::CM_Interleave:
7448         return TTI::CastContextHint::Interleave;
7449       case LoopVectorizationCostModel::CM_Scalarize:
7450       case LoopVectorizationCostModel::CM_Widen:
7451         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7452                                         : TTI::CastContextHint::Normal;
7453       case LoopVectorizationCostModel::CM_Widen_Reverse:
7454         return TTI::CastContextHint::Reversed;
7455       case LoopVectorizationCostModel::CM_Unknown:
7456         llvm_unreachable("Instr did not go through cost modelling?");
7457       }
7458 
7459       llvm_unreachable("Unhandled case!");
7460     };
7461 
7462     unsigned Opcode = I->getOpcode();
7463     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7464     // For Trunc/FPTrunc, the context is the only user, which must be a StoreInst.
7465     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7466       if (I->hasOneUse())
7467         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7468           CCH = ComputeCCH(Store);
7469     }
7470     // For ZExt/SExt/FPExt, the context is the operand, which must be a LoadInst.
7471     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7472              Opcode == Instruction::FPExt) {
7473       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7474         CCH = ComputeCCH(Load);
7475     }
7476 
7477     // We optimize the truncation of induction variables having constant
7478     // integer steps. The cost of these truncations is the same as the scalar
7479     // operation.
7480     if (isOptimizableIVTruncate(I, VF)) {
7481       auto *Trunc = cast<TruncInst>(I);
7482       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7483                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7484     }
7485 
7486     // Detect reduction patterns
7487     InstructionCost RedCost;
7488     if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7489             .isValid())
7490       return RedCost;
7491 
7492     Type *SrcScalarTy = I->getOperand(0)->getType();
7493     Type *SrcVecTy =
7494         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7495     if (canTruncateToMinimalBitwidth(I, VF)) {
7496       // This cast is going to be shrunk. This may remove the cast or it might
7497       // turn it into a slightly different cast. For example, if MinBW == 16,
7498       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7499       //
7500       // Calculate the modified src and dest types.
7501       Type *MinVecTy = VectorTy;
7502       if (Opcode == Instruction::Trunc) {
7503         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7504         VectorTy =
7505             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7506       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7507         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7508         VectorTy =
7509             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7510       }
7511     }
7512 
7513     unsigned N;
7514     if (isScalarAfterVectorization(I, VF)) {
7515       assert(!VF.isScalable() && "VF is assumed to be non scalable");
7516       N = VF.getKnownMinValue();
7517     } else
7518       N = 1;
7519     return N *
7520            TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7521   }
7522   case Instruction::Call: {
7523     bool NeedToScalarize;
7524     CallInst *CI = cast<CallInst>(I);
7525     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7526     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7527       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7528       return std::min(CallCost, IntrinsicCost);
7529     }
7530     return CallCost;
7531   }
7532   case Instruction::ExtractValue:
7533     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7534   default:
7535     // The cost of executing VF copies of the scalar instruction. This opcode
7536     // is unknown. Assume that it is the same as 'mul'.
7537     return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
7538                                        Instruction::Mul, VectorTy, CostKind) +
7539            getScalarizationOverhead(I, VF);
7540   } // end of switch.
7541 }
7542 
7543 char LoopVectorize::ID = 0;
7544 
7545 static const char lv_name[] = "Loop Vectorization";
7546 
7547 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7548 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7549 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7550 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7551 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7552 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7553 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7554 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7555 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7556 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7557 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7558 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7559 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7560 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7561 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7562 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7563 
7564 namespace llvm {
7565 
7566 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7567 
7568 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7569                               bool VectorizeOnlyWhenForced) {
7570   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7571 }
7572 
7573 } // end namespace llvm
7574 
7575 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7576   // Check if the pointer operand of a load or store instruction is
7577   // consecutive.
7578   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7579     return Legal->isConsecutivePtr(Ptr);
7580   return false;
7581 }
7582 
7583 void LoopVectorizationCostModel::collectValuesToIgnore() {
7584   // Ignore ephemeral values.
7585   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7586 
7587   // Ignore type-promoting instructions we identified during reduction
7588   // detection.
7589   for (auto &Reduction : Legal->getReductionVars()) {
7590     RecurrenceDescriptor &RedDes = Reduction.second;
7591     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7592     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7593   }
7594   // Ignore type-casting instructions we identified during induction
7595   // detection.
7596   for (auto &Induction : Legal->getInductionVars()) {
7597     InductionDescriptor &IndDes = Induction.second;
7598     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7599     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7600   }
7601 }
7602 
7603 void LoopVectorizationCostModel::collectInLoopReductions() {
7604   for (auto &Reduction : Legal->getReductionVars()) {
7605     PHINode *Phi = Reduction.first;
7606     RecurrenceDescriptor &RdxDesc = Reduction.second;
7607 
7608     // We don't collect reductions that are type promoted (yet).
7609     if (RdxDesc.getRecurrenceType() != Phi->getType())
7610       continue;
7611 
7612     // If the target would prefer this reduction to happen "in-loop", then we
7613     // want to record it as such.
7614     unsigned Opcode = RdxDesc.getOpcode();
7615     if (!PreferInLoopReductions &&
7616         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7617                                    TargetTransformInfo::ReductionFlags()))
7618       continue;
7619 
7620     // Check that we can correctly put the reductions into the loop, by
7621     // finding the chain of operations that leads from the phi to the loop
7622     // exit value.
7623     SmallVector<Instruction *, 4> ReductionOperations =
7624         RdxDesc.getReductionOpChain(Phi, TheLoop);
7625     bool InLoop = !ReductionOperations.empty();
7626     if (InLoop) {
7627       InLoopReductionChains[Phi] = ReductionOperations;
7628       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7629       Instruction *LastChain = Phi;
7630       for (auto *I : ReductionOperations) {
7631         InLoopReductionImmediateChains[I] = LastChain;
7632         LastChain = I;
7633       }
7634     }
7635     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7636                       << " reduction for phi: " << *Phi << "\n");
7637   }
7638 }
7639 
7640 // TODO: we could return a pair of values that specify the max VF and
7641 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7642 // `buildVPlans(VF, VF)`. We cannot do this yet because VPlan does not
7643 // have a cost model that can choose which plan to execute when more
7644 // than one is generated.
7645 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7646                                  LoopVectorizationCostModel &CM) {
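  // Choose as many elements of the widest type as fit in the widest vector
  // register.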
7647   unsigned WidestType;
7648   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7649   return WidestVectorRegBits / WidestType;
7650 }
7651 
7652 VectorizationFactor
7653 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7654   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7655   ElementCount VF = UserVF;
7656   // Outer loop handling: outer loops may require CFG and instruction-level
7657   // transformations before we can even evaluate whether vectorization is profitable.
7658   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7659   // the vectorization pipeline.
7660   if (!OrigLoop->isInnermost()) {
7661     // If the user doesn't provide a vectorization factor, determine a
7662     // reasonable one.
7663     if (UserVF.isZero()) {
7664       VF = ElementCount::getFixed(
7665           determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
7666       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7667 
7668       // Make sure we have a VF > 1 for stress testing.
7669       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7670         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7671                           << "overriding computed VF.\n");
7672         VF = ElementCount::getFixed(4);
7673       }
7674     }
7675     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7676     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7677            "VF needs to be a power of two");
7678     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7679                       << "VF " << VF << " to build VPlans.\n");
7680     buildVPlans(VF, VF);
7681 
7682     // For VPlan build stress testing, we bail out after VPlan construction.
7683     if (VPlanBuildStressTest)
7684       return VectorizationFactor::Disabled();
7685 
7686     return {VF, 0 /*Cost*/};
7687   }
7688 
7689   LLVM_DEBUG(
7690       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7691                 "VPlan-native path.\n");
7692   return VectorizationFactor::Disabled();
7693 }
7694 
7695 Optional<VectorizationFactor>
7696 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7697   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7698   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
7699   if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
7700     return None;
7701 
7702   // Invalidate interleave groups if all blocks of the loop will be predicated.
7703   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
7704       !useMaskedInterleavedAccesses(*TTI)) {
7705     LLVM_DEBUG(
7706         dbgs()
7707         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7708            "which requires masked-interleaved support.\n");
7709     if (CM.InterleaveInfo.invalidateGroups())
7710       // Invalidating interleave groups also requires invalidating all decisions
7711       // based on them, which includes widening decisions and uniform and scalar
7712       // values.
7713       CM.invalidateCostModelingDecisions();
7714   }
7715 
7716   ElementCount MaxVF = MaybeMaxVF.getValue();
7717   assert(MaxVF.isNonZero() && "MaxVF is zero.");
7718 
7719   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
7720   if (!UserVF.isZero() &&
7721       (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
7722     // FIXME: MaxVF is temporarily used in place of UserVF for illegal scalable
7723     // VFs here; this should be reverted to only use legal UserVFs once the
7724     // loop below supports scalable VFs.
7725     ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
7726     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
7727                       << " VF " << VF << ".\n");
7728     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7729            "VF needs to be a power of two");
7730     // Collect the instructions (and their associated costs) that will be more
7731     // profitable to scalarize.
7732     CM.selectUserVectorizationFactor(VF);
7733     CM.collectInLoopReductions();
7734     buildVPlansWithVPRecipes(VF, VF);
7735     LLVM_DEBUG(printPlans(dbgs()));
7736     return {{VF, 0}};
7737   }
7738 
7739   assert(!MaxVF.isScalable() &&
7740          "Scalable vectors not yet supported beyond this point");
7741 
7742   for (ElementCount VF = ElementCount::getFixed(1);
7743        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
7744     // Collect Uniform and Scalar instructions after vectorization with VF.
7745     CM.collectUniformsAndScalars(VF);
7746 
7747     // Collect the instructions (and their associated costs) that will be more
7748     // profitable to scalarize.
7749     if (VF.isVector())
7750       CM.collectInstsToScalarize(VF);
7751   }
7752 
7753   CM.collectInLoopReductions();
7754 
7755   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
7756   LLVM_DEBUG(printPlans(dbgs()));
7757   if (MaxVF.isScalar())
7758     return VectorizationFactor::Disabled();
7759 
7760   // Select the optimal vectorization factor.
7761   return CM.selectVectorizationFactor(MaxVF);
7762 }
7763 
7764 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
7765   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
7766                     << '\n');
7767   BestVF = VF;
7768   BestUF = UF;
7769 
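  // Discard all VPlans that cannot handle the chosen VF; exactly one plan
  // should remain.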
7770   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
7771     return !Plan->hasVF(VF);
7772   });
7773   assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
7774 }
7775 
7776 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
7777                                            DominatorTree *DT) {
7778   // Perform the actual loop transformation.
7779 
7780   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7781   assert(BestVF.hasValue() && "Vectorization Factor is missing");
7782   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
7783 
7784   VPTransformState State{
7785       *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
7786   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7787   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7788   State.CanonicalIV = ILV.Induction;
7789 
7790   ILV.printDebugTracesAtStart();
7791 
7792   //===------------------------------------------------===//
7793   //
7794   // Notice: any optimization or new instruction that goes
7795   // into the code below should also be implemented in
7796   // the cost model.
7797   //
7798   //===------------------------------------------------===//
7799 
7800   // 2. Copy and widen instructions from the old loop into the new loop.
7801   VPlans.front()->execute(&State);
7802 
7803   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7804   //    predication, updating analyses.
7805   ILV.fixVectorizedLoop(State);
7806 
7807   ILV.printDebugTracesAtEnd();
7808 }
7809 
7810 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7811     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7812 
7813   // We create new control flow for the vectorized loop, so an original exit
7814   // condition will be dead after vectorization if it is only used by its
7815   // block's terminator.
7816   SmallVector<BasicBlock*> ExitingBlocks;
7817   OrigLoop->getExitingBlocks(ExitingBlocks);
7818   for (auto *BB : ExitingBlocks) {
7819     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7820     if (!Cmp || !Cmp->hasOneUse())
7821       continue;
7822 
7823     // TODO: we should introduce a getUniqueExitingBlocks on Loop
7824     if (!DeadInstructions.insert(Cmp).second)
7825       continue;
7826 
7827     // An operand of the icmp is often a dead trunc, used by IndUpdate.
7828     // TODO: can recurse through operands in general
7829     for (Value *Op : Cmp->operands()) {
7830       if (isa<TruncInst>(Op) && Op->hasOneUse())
7831           DeadInstructions.insert(cast<Instruction>(Op));
7832     }
7833   }
7834 
7835   // We create new "steps" for induction variable updates to which the original
7836   // induction variables map. An original update instruction will be dead if
7837   // all its users except the induction variable are dead.
7838   auto *Latch = OrigLoop->getLoopLatch();
7839   for (auto &Induction : Legal->getInductionVars()) {
7840     PHINode *Ind = Induction.first;
7841     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7842 
7843     // If the tail is to be folded by masking, the primary induction variable,
7844     // if it exists, isn't dead: it will be used for masking. Don't kill it.
7845     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7846       continue;
7847 
7848     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7849           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7850         }))
7851       DeadInstructions.insert(IndUpdate);
7852 
7853     // We also record as "Dead" the type-casting instructions we had identified
7854     // during induction analysis. We don't need any handling for them in the
7855     // vectorized loop because we have proven that, under a proper runtime
7856     // test guarding the vectorized loop, the value of the phi, and the casted
7857     // value of the phi, are the same. The last instruction in this casting chain
7858     // will get its scalar/vector/widened def from the scalar/vector/widened def
7859     // of the respective phi node. Any other casts in the induction def-use chain
7860     // have no other uses outside the phi update chain, and will be ignored.
7861     InductionDescriptor &IndDes = Induction.second;
7862     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7863     DeadInstructions.insert(Casts.begin(), Casts.end());
7864   }
7865 }
7866 
7867 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
7868 
7869 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7870 
7871 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
7872                                         Instruction::BinaryOps BinOp) {
7873   // When unrolling and the VF is 1, we only need to add a simple scalar.
7874   Type *Ty = Val->getType();
7875   assert(!Ty->isVectorTy() && "Val must be a scalar");
7876 
7877   if (Ty->isFloatingPointTy()) {
7878     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
7879 
7880     // Floating-point operations inherit FMF via the builder's flags.
7881     Value *MulOp = Builder.CreateFMul(C, Step);
7882     return Builder.CreateBinOp(BinOp, Val, MulOp);
7883   }
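  // Integer induction step: Val + StartIdx * Step.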
7884   Constant *C = ConstantInt::get(Ty, StartIdx);
7885   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
7886 }
7887 
7888 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7889   SmallVector<Metadata *, 4> MDs;
7890   // Reserve first location for self reference to the LoopID metadata node.
7891   MDs.push_back(nullptr);
7892   bool IsUnrollMetadata = false;
7893   MDNode *LoopID = L->getLoopID();
7894   if (LoopID) {
7895     // First find existing loop unrolling disable metadata.
7896     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7897       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7898       if (MD) {
7899         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7900         IsUnrollMetadata =
7901             S && S->getString().startswith("llvm.loop.unroll.disable");
7902       }
7903       MDs.push_back(LoopID->getOperand(i));
7904     }
7905   }
7906 
7907   if (!IsUnrollMetadata) {
7908     // Add runtime unroll disable metadata.
7909     LLVMContext &Context = L->getHeader()->getContext();
7910     SmallVector<Metadata *, 1> DisableOperands;
7911     DisableOperands.push_back(
7912         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7913     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7914     MDs.push_back(DisableNode);
7915     MDNode *NewLoopID = MDNode::get(Context, MDs);
7916     // Set operand 0 to refer to the loop id itself.
7917     NewLoopID->replaceOperandWith(0, NewLoopID);
7918     L->setLoopID(NewLoopID);
7919   }
7920 }
7921 
7922 //===--------------------------------------------------------------------===//
7923 // EpilogueVectorizerMainLoop
7924 //===--------------------------------------------------------------------===//
7925 
7926 /// This function is partially responsible for generating the control flow
7927 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7928 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7929   MDNode *OrigLoopID = OrigLoop->getLoopID();
7930   Loop *Lp = createVectorLoopSkeleton("");
7931 
7932   // Generate the code to check the minimum iteration count of the vector
7933   // epilogue (see below).
7934   EPI.EpilogueIterationCountCheck =
7935       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
7936   EPI.EpilogueIterationCountCheck->setName("iter.check");
7937 
7938   // Generate the code to check any assumptions that we've made for SCEV
7939   // expressions.
7940   EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);
7941 
7942   // Generate the code that checks at runtime if arrays overlap. We put the
7943   // checks into a separate block to make the more common case of few elements
7944   // faster.
7945   EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
7946 
7947   // Generate the iteration count check for the main loop, *after* the check
7948   // for the epilogue loop, so that the path-length is shorter for the case
7949   // that goes directly through the vector epilogue. The longer-path length for
7950   // that goes directly through the vector epilogue. The longer path length for
7951   // the main loop is compensated for by the gain from vectorizing the larger
7952   // the epilogue.
7953   EPI.MainLoopIterationCountCheck =
7954       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
7955 
7956   // Generate the induction variable.
7957   OldInduction = Legal->getPrimaryInduction();
7958   Type *IdxTy = Legal->getWidestInductionType();
7959   Value *StartIdx = ConstantInt::get(IdxTy, 0);
7960   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
7961   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7962   EPI.VectorTripCount = CountRoundDown;
7963   Induction =
7964       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
7965                               getDebugLocFromInstOrOperands(OldInduction));
7966 
7967   // Skip creating induction resume values here; they will be created in
7968   // the second pass. If we created them here, they wouldn't be used anyway,
7969   // because the VPlan in the second pass still contains the inductions from
7970   // the original loop.
7971 
7972   return completeLoopSkeleton(Lp, OrigLoopID);
7973 }
7974 
7975 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7976   LLVM_DEBUG({
7977     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7978            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
7979            << ", Main Loop UF:" << EPI.MainLoopUF
7980            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
7981            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7982   });
7983 }
7984 
7985 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7986   DEBUG_WITH_TYPE(VerboseDebug, {
7987     dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
7988   });
7989 }
7990 
7991 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
7992     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
7993   assert(L && "Expected valid Loop.");
7994   assert(Bypass && "Expected valid bypass basic block.");
7995   unsigned VFactor =
7996       ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
7997   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7998   Value *Count = getOrCreateTripCount(L);
7999   // Reuse existing vector loop preheader for TC checks.
8000   // Note that new preheader block is generated for vector loop.
8001   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
8002   IRBuilder<> Builder(TCCheckBlock->getTerminator());
8003 
8004   // Generate code to check if the loop's trip count is less than VF * UF of the
8005   // main vector loop.
8006   auto P =
8007       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8008 
8009   Value *CheckMinIters = Builder.CreateICmp(
8010       P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
8011       "min.iters.check");
8012 
8013   if (!ForEpilogue)
8014     TCCheckBlock->setName("vector.main.loop.iter.check");
8015 
8016   // Create new preheader for vector loop.
8017   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
8018                                    DT, LI, nullptr, "vector.ph");
8019 
8020   if (ForEpilogue) {
8021     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8022                                  DT->getNode(Bypass)->getIDom()) &&
8023            "TC check is expected to dominate Bypass");
8024 
8025     // Update dominator for Bypass & LoopExit.
8026     DT->changeImmediateDominator(Bypass, TCCheckBlock);
8027     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8028 
8029     LoopBypassBlocks.push_back(TCCheckBlock);
8030 
8031     // Save the trip count so we don't have to regenerate it in the
8032     // vec.epilog.iter.check. This is safe to do because the trip count
8033     // generated here dominates the vector epilog iter check.
8034     EPI.TripCount = Count;
8035   }
8036 
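  // Branch to Bypass when there are too few iterations; otherwise fall through
  // to the vector preheader.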
8037   ReplaceInstWithInst(
8038       TCCheckBlock->getTerminator(),
8039       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8040 
8041   return TCCheckBlock;
8042 }
8043 
8044 //===--------------------------------------------------------------------===//
8045 // EpilogueVectorizerEpilogueLoop
8046 //===--------------------------------------------------------------------===//
8047 
8048 /// This function is partially responsible for generating the control flow
8049 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8050 BasicBlock *
8051 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
8052   MDNode *OrigLoopID = OrigLoop->getLoopID();
8053   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
8054 
8055   // Now compare the remaining count; if there aren't enough iterations to
8056   // execute the vectorized epilogue, skip to the scalar part.
8057   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
8058   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
8059   LoopVectorPreHeader =
8060       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
8061                  LI, nullptr, "vec.epilog.ph");
8062   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
8063                                           VecEpilogueIterationCountCheck);
8064 
8065   // Adjust the control flow taking the state info from the main loop
8066   // vectorization into account.
8067   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8068          "expected this to be saved from the previous pass.");
8069   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8070       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8071 
8072   DT->changeImmediateDominator(LoopVectorPreHeader,
8073                                EPI.MainLoopIterationCountCheck);
8074 
8075   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8076       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8077 
8078   if (EPI.SCEVSafetyCheck)
8079     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8080         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8081   if (EPI.MemSafetyCheck)
8082     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8083         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8084 
8085   DT->changeImmediateDominator(
8086       VecEpilogueIterationCountCheck,
8087       VecEpilogueIterationCountCheck->getSinglePredecessor());
8088 
8089   DT->changeImmediateDominator(LoopScalarPreHeader,
8090                                EPI.EpilogueIterationCountCheck);
8091   DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
8092 
8093   // Keep track of bypass blocks, as they feed start values to the induction
8094   // phis in the scalar loop preheader.
8095   if (EPI.SCEVSafetyCheck)
8096     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8097   if (EPI.MemSafetyCheck)
8098     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8099   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8100 
8101   // Generate a resume induction for the vector epilogue and put it in the
8102   // vector epilogue preheader.
8103   Type *IdxTy = Legal->getWidestInductionType();
8104   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8105                                          LoopVectorPreHeader->getFirstNonPHI());
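  // Resume from the main loop's vector trip count if it executed, or from zero
  // if the main vector loop was skipped.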
8106   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8107   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8108                            EPI.MainLoopIterationCountCheck);
8109 
8110   // Generate the induction variable.
8111   OldInduction = Legal->getPrimaryInduction();
8112   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8113   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8114   Value *StartIdx = EPResumeVal;
8115   Induction =
8116       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8117                               getDebugLocFromInstOrOperands(OldInduction));
8118 
8119   // Generate induction resume values. These variables save the new starting
8120   // indexes for the scalar loop. They are used to test if there are any tail
8121   // iterations left once the vector loop has completed.
8122   // Note that when the vectorized epilogue is skipped due to iteration count
8123   // check, then the resume value for the induction variable comes from
8124   // the trip count of the main vector loop, hence passing the AdditionalBypass
8125   // argument.
8126   createInductionResumeValues(Lp, CountRoundDown,
8127                               {VecEpilogueIterationCountCheck,
8128                                EPI.VectorTripCount} /* AdditionalBypass */);
8129 
8130   AddRuntimeUnrollDisableMetaData(Lp);
8131   return completeLoopSkeleton(Lp, OrigLoopID);
8132 }
8133 
8134 BasicBlock *
8135 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8136     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8137 
8138   assert(EPI.TripCount &&
8139          "Expected trip count to have been saved in the first pass.");
8140   assert(
8141       (!isa<Instruction>(EPI.TripCount) ||
8142        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8143       "saved trip count does not dominate insertion point.");
8144   Value *TC = EPI.TripCount;
8145   IRBuilder<> Builder(Insert->getTerminator());
8146   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8147 
8148   // Generate code to check if the loop's trip count is less than VF * UF of the
8149   // vector epilogue loop.
8150   auto P =
8151       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8152 
8153   Value *CheckMinIters = Builder.CreateICmp(
8154       P, Count,
8155       ConstantInt::get(Count->getType(),
8156                        EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
8157       "min.epilog.iters.check");
8158 
8159   ReplaceInstWithInst(
8160       Insert->getTerminator(),
8161       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8162 
8163   LoopBypassBlocks.push_back(Insert);
8164   return Insert;
8165 }
8166 
8167 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8168   LLVM_DEBUG({
8169     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8170            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
8171            << ", Main Loop UF:" << EPI.MainLoopUF
8172            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
8173            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8174   });
8175 }
8176 
8177 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8178   DEBUG_WITH_TYPE(VerboseDebug, {
8179     dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
8180   });
8181 }
8182 
8183 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8184     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8185   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8186   bool PredicateAtRangeStart = Predicate(Range.Start);
8187 
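  // Clamp Range.End at the first power-of-two VF whose decision differs from
  // the decision at Range.Start.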
8188   for (ElementCount TmpVF = Range.Start * 2;
8189        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8190     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8191       Range.End = TmpVF;
8192       break;
8193     }
8194 
8195   return PredicateAtRangeStart;
8196 }
8197 
8198 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8199 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8200 /// of VF's starting at a given VF and extending it as much as possible. Each
8201 /// vectorization decision can potentially shorten this sub-range during
8202 /// buildVPlan().
8203 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8204                                            ElementCount MaxVF) {
8205   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8206   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8207     VFRange SubRange = {VF, MaxVFPlusOne};
8208     VPlans.push_back(buildVPlan(SubRange));
8209     VF = SubRange.End;
8210   }
8211 }
8212 
8213 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8214                                          VPlanPtr &Plan) {
8215   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8216 
8217   // Look for cached value.
8218   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8219   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8220   if (ECEntryIt != EdgeMaskCache.end())
8221     return ECEntryIt->second;
8222 
8223   VPValue *SrcMask = createBlockInMask(Src, Plan);
8224 
8225   // The terminator has to be a branch inst!
8226   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8227   assert(BI && "Unexpected terminator found");
8228 
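  // An unconditional branch, or one whose successors coincide, adds no extra
  // condition on the edge; reuse the source block mask.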
8229   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8230     return EdgeMaskCache[Edge] = SrcMask;
8231 
8232   // If source is an exiting block, we know the exit edge is dynamically dead
8233   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8234   // adding uses of an otherwise potentially dead instruction.
8235   if (OrigLoop->isLoopExiting(Src))
8236     return EdgeMaskCache[Edge] = SrcMask;
8237 
8238   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8239   assert(EdgeMask && "No Edge Mask found for condition");
8240 
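  // If Dst is the false successor, the edge is taken when the condition is
  // false, so negate the mask.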
8241   if (BI->getSuccessor(0) != Dst)
8242     EdgeMask = Builder.createNot(EdgeMask);
8243 
8244   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8245     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8246     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8247     // The select version does not introduce new UB if SrcMask is false and
8248     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8249     VPValue *False = Plan->getOrAddVPValue(
8250         ConstantInt::getFalse(BI->getCondition()->getType()));
8251     EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
8252   }
8253 
8254   return EdgeMaskCache[Edge] = EdgeMask;
8255 }
8256 
8257 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8258   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8259 
8260   // Look for cached value.
8261   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8262   if (BCEntryIt != BlockMaskCache.end())
8263     return BCEntryIt->second;
8264 
8265   // All-one mask is modelled as no-mask following the convention for masked
8266   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8267   VPValue *BlockMask = nullptr;
8268 
8269   if (OrigLoop->getHeader() == BB) {
8270     if (!CM.blockNeedsPredication(BB))
8271       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8272 
8273     // Create the block in mask as the first non-phi instruction in the block.
8274     VPBuilder::InsertPointGuard Guard(Builder);
8275     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
8276     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
8277 
8278     // Introduce the early-exit compare IV <= BTC to form header block mask.
8279     // This is used instead of IV < TC because TC may wrap, unlike BTC.
8280     // Start by constructing the desired canonical IV.
8281     VPValue *IV = nullptr;
8282     if (Legal->getPrimaryInduction())
8283       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
8284     else {
8285       auto IVRecipe = new VPWidenCanonicalIVRecipe();
8286       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
8287       IV = IVRecipe->getVPValue();
8288     }
8289     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8290     bool TailFolded = !CM.isScalarEpilogueAllowed();
8291 
8292     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
8293       // While ActiveLaneMask is a binary op that consumes the loop tripcount
8294       // as a second argument, we only pass the IV here and extract the
8295       // tripcount from the transform state where codegen of the VP instructions
8296       // happens.
8297       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
8298     } else {
8299       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8300     }
8301     return BlockMaskCache[BB] = BlockMask;
8302   }
8303 
8304   // This is the block mask. We OR all incoming edges.
8305   for (auto *Predecessor : predecessors(BB)) {
8306     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8307     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8308       return BlockMaskCache[BB] = EdgeMask;
8309 
8310     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8311       BlockMask = EdgeMask;
8312       continue;
8313     }
8314 
8315     BlockMask = Builder.createOr(BlockMask, EdgeMask);
8316   }
8317 
8318   return BlockMaskCache[BB] = BlockMask;
8319 }
8320 
8321 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
8322                                                 VPlanPtr &Plan) {
8323   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8324          "Must be called with either a load or store");
8325 
8326   auto willWiden = [&](ElementCount VF) -> bool {
8327     if (VF.isScalar())
8328       return false;
8329     LoopVectorizationCostModel::InstWidening Decision =
8330         CM.getWideningDecision(I, VF);
8331     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8332            "CM decision should be taken at this point.");
8333     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8334       return true;
8335     if (CM.isScalarAfterVectorization(I, VF) ||
8336         CM.isProfitableToScalarize(I, VF))
8337       return false;
8338     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8339   };
8340 
8341   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8342     return nullptr;
8343 
8344   VPValue *Mask = nullptr;
8345   if (Legal->isMaskRequired(I))
8346     Mask = createBlockInMask(I->getParent(), Plan);
8347 
8348   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
8349   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8350     return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
8351 
8352   StoreInst *Store = cast<StoreInst>(I);
8353   VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
8354   return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
8355 }
8356 
8357 VPWidenIntOrFpInductionRecipe *
8358 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const {
8359   // Check if this is an integer or fp induction. If so, build the recipe that
8360   // produces its scalar and vector values.
8361   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8362   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8363       II.getKind() == InductionDescriptor::IK_FpInduction) {
8364     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8365     const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts();
8366     return new VPWidenIntOrFpInductionRecipe(
8367         Phi, Start, Casts.empty() ? nullptr : Casts.front());
8368   }
8369 
8370   return nullptr;
8371 }
8372 
8373 VPWidenIntOrFpInductionRecipe *
8374 VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range,
8375                                                 VPlan &Plan) const {
8376   // Optimize the special case where the source is a constant integer
8377   // induction variable. Notice that we can only optimize the 'trunc' case
8378   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8379   // (c) other casts depend on pointer size.
8380 
8381   // Determine whether \p K is a truncation based on an induction variable that
8382   // can be optimized.
8383   auto isOptimizableIVTruncate =
8384       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8385     return [=](ElementCount VF) -> bool {
8386       return CM.isOptimizableIVTruncate(K, VF);
8387     };
8388   };
8389 
8390   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8391           isOptimizableIVTruncate(I), Range)) {
8392 
8393     InductionDescriptor II =
8394         Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
8395     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8396     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8397                                              Start, nullptr, I);
8398   }
8399   return nullptr;
8400 }
8401 
8402 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
8403   // If all incoming values are equal, the incoming VPValue can be used directly
8404   // instead of creating a new VPBlendRecipe.
8405   Value *FirstIncoming = Phi->getIncomingValue(0);
8406   if (all_of(Phi->incoming_values(), [FirstIncoming](const Value *Inc) {
8407         return FirstIncoming == Inc;
8408       })) {
8409     return Plan->getOrAddVPValue(Phi->getIncomingValue(0));
8410   }
8411 
8412   // We know that all PHIs in non-header blocks are converted into selects, so
8413   // we don't have to worry about the insertion order and we can just use the
8414   // builder. At this point we generate the predication tree. There may be
8415   // duplications since this is a simple recursive scan, but future
8416   // optimizations will clean it up.
8417   SmallVector<VPValue *, 2> Operands;
8418   unsigned NumIncoming = Phi->getNumIncomingValues();
8419 
8420   for (unsigned In = 0; In < NumIncoming; In++) {
8421     VPValue *EdgeMask =
8422       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8423     assert((EdgeMask || NumIncoming == 1) &&
8424            "Multiple predecessors with one having a full mask");
8425     Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
8426     if (EdgeMask)
8427       Operands.push_back(EdgeMask);
8428   }
8429   return toVPRecipeResult(new VPBlendRecipe(Phi, Operands));
8430 }
8431 
8432 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
8433                                                    VPlan &Plan) const {
8434 
8435   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8436       [this, CI](ElementCount VF) {
8437         return CM.isScalarWithPredication(CI, VF);
8438       },
8439       Range);
8440 
8441   if (IsPredicated)
8442     return nullptr;
8443 
8444   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
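  // These intrinsics are markers with no computation to widen; without a call
  // recipe they fall through to scalar replication (or are dropped if dead).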
8445   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8446              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8447              ID == Intrinsic::pseudoprobe ||
8448              ID == Intrinsic::experimental_noalias_scope_decl))
8449     return nullptr;
8450 
8451   auto willWiden = [&](ElementCount VF) -> bool {
8452     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8453     // The following case may be scalarized depending on the VF.
8454     // NeedToScalarize shows whether the vectorized call must be scalarized.
8455     // We widen the call if a vector intrinsic is available and no more
8456     // expensive than the library call, or if the call need not be scalarized.
8457     bool NeedToScalarize = false;
8458     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8459     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8460     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8461     assert(IntrinsicCost.isValid() && CallCost.isValid() &&
8462            "Cannot have invalid costs while widening");
8463     return UseVectorIntrinsic || !NeedToScalarize;
8464   };
8465 
8466   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8467     return nullptr;
8468 
8469   return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
8470 }
8471 
8472 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8473   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8474          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8475   // Instruction should be widened, unless it is scalar after vectorization,
8476   // scalarization is profitable or it is predicated.
8477   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8478     return CM.isScalarAfterVectorization(I, VF) ||
8479            CM.isProfitableToScalarize(I, VF) ||
8480            CM.isScalarWithPredication(I, VF);
8481   };
8482   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8483                                                              Range);
8484 }
8485 
8486 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
8487   auto IsVectorizableOpcode = [](unsigned Opcode) {
8488     switch (Opcode) {
8489     case Instruction::Add:
8490     case Instruction::And:
8491     case Instruction::AShr:
8492     case Instruction::BitCast:
8493     case Instruction::FAdd:
8494     case Instruction::FCmp:
8495     case Instruction::FDiv:
8496     case Instruction::FMul:
8497     case Instruction::FNeg:
8498     case Instruction::FPExt:
8499     case Instruction::FPToSI:
8500     case Instruction::FPToUI:
8501     case Instruction::FPTrunc:
8502     case Instruction::FRem:
8503     case Instruction::FSub:
8504     case Instruction::ICmp:
8505     case Instruction::IntToPtr:
8506     case Instruction::LShr:
8507     case Instruction::Mul:
8508     case Instruction::Or:
8509     case Instruction::PtrToInt:
8510     case Instruction::SDiv:
8511     case Instruction::Select:
8512     case Instruction::SExt:
8513     case Instruction::Shl:
8514     case Instruction::SIToFP:
8515     case Instruction::SRem:
8516     case Instruction::Sub:
8517     case Instruction::Trunc:
8518     case Instruction::UDiv:
8519     case Instruction::UIToFP:
8520     case Instruction::URem:
8521     case Instruction::Xor:
8522     case Instruction::ZExt:
8523       return true;
8524     }
8525     return false;
8526   };
8527 
8528   if (!IsVectorizableOpcode(I->getOpcode()))
8529     return nullptr;
8530 
8531   // Success: widen this instruction.
8532   return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
8533 }
8534 
8535 VPBasicBlock *VPRecipeBuilder::handleReplication(
8536     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8537     VPlanPtr &Plan) {
8538   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8539       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8540       Range);
8541 
8542   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8543       [&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
8544       Range);
8545 
8546   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8547                                        IsUniform, IsPredicated);
8548   setRecipe(I, Recipe);
8549   Plan->addVPValue(I, Recipe);
8550 
8551   // Find if I uses a predicated instruction. If so, it will use its scalar
8552   // value. Avoid hoisting the insert-element which packs the scalar value into
8553   // a vector value, as that happens iff all users use the vector value.
8554   for (VPValue *Op : Recipe->operands()) {
8555     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8556     if (!PredR)
8557       continue;
8558     auto *RepR =
8559         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8560     assert(RepR->isPredicated() &&
8561            "expected Replicate recipe to be predicated");
8562     RepR->setAlsoPack(false);
8563   }
8564 
8565   // Finalize the recipe for Instr, first if it is not predicated.
8566   if (!IsPredicated) {
8567     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8568     VPBB->appendRecipe(Recipe);
8569     return VPBB;
8570   }
8571   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8572   assert(VPBB->getSuccessors().empty() &&
8573          "VPBB has successors when handling predicated replication.");
8574   // Record predicated instructions for above packing optimizations.
8575   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8576   VPBlockUtils::insertBlockAfter(Region, VPBB);
8577   auto *RegSucc = new VPBasicBlock();
8578   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8579   return RegSucc;
8580 }
8581 
8582 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8583                                                       VPRecipeBase *PredRecipe,
8584                                                       VPlanPtr &Plan) {
8585   // Instructions marked for predication are replicated and placed under an
8586   // if-then construct to prevent side-effects.
8587 
8588   // Generate recipes to compute the block mask for this region.
8589   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8590 
8591   // Build the triangular if-then region.
8592   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8593   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8594   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8595   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8596   auto *PHIRecipe = Instr->getType()->isVoidTy()
8597                         ? nullptr
8598                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8599   if (PHIRecipe) {
8600     Plan->removeVPValueFor(Instr);
8601     Plan->addVPValue(Instr, PHIRecipe);
8602   }
8603   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8604   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8605   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8606 
8607   // Note: first set Entry as region entry and then connect successors starting
8608   // from it in order, to propagate the "parent" of each VPBasicBlock.
8609   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8610   VPBlockUtils::connectBlocks(Pred, Exit);
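  // The region now forms a triangle: Entry branches on the mask to either Pred
  // or Exit, and Pred falls through to Exit, where the optional phi recipe
  // merges the predicated result.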
8611 
8612   return Region;
8613 }
8614 
8615 VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8616                                                             VFRange &Range,
8617                                                             VPlanPtr &Plan) {
8618   // First, check for specific widening recipes that deal with calls, memory
8619   // operations, inductions and Phi nodes.
8620   if (auto *CI = dyn_cast<CallInst>(Instr))
8621     return toVPRecipeResult(tryToWidenCall(CI, Range, *Plan));
8622 
8623   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8624     return toVPRecipeResult(tryToWidenMemory(Instr, Range, Plan));
8625 
8626   VPRecipeBase *Recipe;
8627   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8628     if (Phi->getParent() != OrigLoop->getHeader())
8629       return tryToBlend(Phi, Plan);
8630     if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
8631       return toVPRecipeResult(Recipe);
8632 
8633     if (Legal->isReductionVariable(Phi)) {
8634       RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8635       VPValue *StartV =
8636           Plan->getOrAddVPValue(RdxDesc.getRecurrenceStartValue());
8637       return toVPRecipeResult(new VPWidenPHIRecipe(Phi, RdxDesc, *StartV));
8638     }
8639 
8640     return toVPRecipeResult(new VPWidenPHIRecipe(Phi));
8641   }
8642 
8643   if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8644                                     cast<TruncInst>(Instr), Range, *Plan)))
8645     return toVPRecipeResult(Recipe);
8646 
8647   if (!shouldWiden(Instr, Range))
8648     return nullptr;
8649 
8650   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8651     return toVPRecipeResult(new VPWidenGEPRecipe(
8652         GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop));
8653 
8654   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8655     bool InvariantCond =
8656         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8657     return toVPRecipeResult(new VPWidenSelectRecipe(
8658         *SI, Plan->mapToVPValues(SI->operands()), InvariantCond));
8659   }
8660 
8661   return toVPRecipeResult(tryToWiden(Instr, *Plan));
8662 }
8663 
8664 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8665                                                         ElementCount MaxVF) {
8666   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8667 
8668   // Collect instructions from the original loop that will become trivially dead
8669   // in the vectorized loop. We don't need to vectorize these instructions. For
8670   // example, original induction update instructions can become dead because we
8671   // separately emit induction "steps" when generating code for the new loop.
8672   // Similarly, we create a new latch condition when setting up the structure
8673   // of the new loop, so the old one can become dead.
8674   SmallPtrSet<Instruction *, 4> DeadInstructions;
8675   collectTriviallyDeadInstructions(DeadInstructions);
8676 
8677   // Add assume instructions we need to drop to DeadInstructions, to prevent
8678   // them from being added to the VPlan.
8679   // TODO: We only need to drop assumes in blocks that get flattened. If the
8680   // control flow is preserved, we should keep them.
8681   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8682   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8683 
8684   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8685   // Dead instructions do not need sinking. Remove them from SinkAfter.
8686   for (Instruction *I : DeadInstructions)
8687     SinkAfter.erase(I);
8688 
8689   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
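  // Build one VPlan per sub-range of VFs; buildVPlanWithVPRecipes may clamp
  // SubRange.End so that all VFs in a sub-range share the same recipes.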
8690   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8691     VFRange SubRange = {VF, MaxVFPlusOne};
8692     VPlans.push_back(
8693         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8694     VF = SubRange.End;
8695   }
8696 }
8697 
8698 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8699     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8700     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
8701 
8702   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8703 
8704   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8705 
8706   // ---------------------------------------------------------------------------
8707   // Pre-construction: record ingredients whose recipes we'll need to further
8708   // process after constructing the initial VPlan.
8709   // ---------------------------------------------------------------------------
8710 
8711   // Mark instructions we'll need to sink later and their targets as
8712   // ingredients whose recipe we'll need to record.
8713   for (auto &Entry : SinkAfter) {
8714     RecipeBuilder.recordRecipeOf(Entry.first);
8715     RecipeBuilder.recordRecipeOf(Entry.second);
8716   }
8717   for (auto &Reduction : CM.getInLoopReductionChains()) {
8718     PHINode *Phi = Reduction.first;
8719     RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
8720     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8721 
8722     RecipeBuilder.recordRecipeOf(Phi);
8723     for (auto &R : ReductionOperations) {
8724       RecipeBuilder.recordRecipeOf(R);
8725       // For min/max reductions, where we have a pair of icmp/select, we also
8726       // need to record the ICmp recipe, so it can be removed later.
8727       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8728         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8729     }
8730   }
8731 
8732   // For each interleave group which is relevant for this (possibly trimmed)
8733   // Range, add it to the set of groups to be later applied to the VPlan and add
8734   // placeholders for its members' Recipes which we'll be replacing with a
8735   // single VPInterleaveRecipe.
8736   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8737     auto applyIG = [IG, this](ElementCount VF) -> bool {
8738       return (VF.isVector() && // Query is illegal for VF == 1
8739               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8740                   LoopVectorizationCostModel::CM_Interleave);
8741     };
8742     if (!getDecisionAndClampRange(applyIG, Range))
8743       continue;
8744     InterleaveGroups.insert(IG);
8745     for (unsigned i = 0; i < IG->getFactor(); i++)
8746       if (Instruction *Member = IG->getMember(i))
8747         RecipeBuilder.recordRecipeOf(Member);
8748   }
8749 
8750   // ---------------------------------------------------------------------------
8751   // Build initial VPlan: Scan the body of the loop in a topological order to
8752   // visit each basic block after having visited its predecessor basic blocks.
8753   // ---------------------------------------------------------------------------
8754 
8755   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
8756   auto Plan = std::make_unique<VPlan>();
8757   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
8758   Plan->setEntry(VPBB);
8759 
8760   // Scan the body of the loop in a topological order to visit each basic block
8761   // after having visited its predecessor basic blocks.
8762   LoopBlocksDFS DFS(OrigLoop);
8763   DFS.perform(LI);
8764 
8765   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8766     // Relevant instructions from basic block BB will be grouped into VPRecipe
8767     // ingredients and fill a new VPBasicBlock.
8768     unsigned VPBBsForBB = 0;
8769     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
8770     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
8771     VPBB = FirstVPBBForBB;
8772     Builder.setInsertPoint(VPBB);
8773 
8774     // Introduce each ingredient into VPlan.
8775     // TODO: Model and preserve debug intrinsics in VPlan.
8776     for (Instruction &I : BB->instructionsWithoutDebug()) {
8777       Instruction *Instr = &I;
8778 
8779       // First filter out irrelevant instructions, to ensure no recipes are
8780       // built for them.
8781       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8782         continue;
8783 
8784       if (auto RecipeOrValue =
8785               RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
8786         // If Instr can be simplified to an existing VPValue, use it.
8787         if (RecipeOrValue.is<VPValue *>()) {
8788           Plan->addVPValue(Instr, RecipeOrValue.get<VPValue *>());
8789           continue;
8790         }
8791         // Otherwise, add the new recipe.
8792         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8793         for (auto *Def : Recipe->definedValues()) {
8794           auto *UV = Def->getUnderlyingValue();
8795           Plan->addVPValue(UV, Def);
8796         }
8797 
8798         RecipeBuilder.setRecipe(Instr, Recipe);
8799         VPBB->appendRecipe(Recipe);
8800         continue;
8801       }
8802 
8803       // Otherwise, if all widening options failed, the instruction is to be
8804       // replicated. This may create a successor for VPBB.
8805       VPBasicBlock *NextVPBB =
8806           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8807       if (NextVPBB != VPBB) {
8808         VPBB = NextVPBB;
8809         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8810                                     : "");
8811       }
8812     }
8813   }
8814 
8815   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
8816   // may also be empty, such as the last one (VPBB), reflecting original
8817   // basic blocks with no recipes.
8818   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
8819   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
8820   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
8821   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
8822   delete PreEntry;
8823 
8824   // ---------------------------------------------------------------------------
8825   // Transform initial VPlan: Apply previously taken decisions, in order, to
8826   // bring the VPlan to its final state.
8827   // ---------------------------------------------------------------------------
8828 
8829   // Apply Sink-After legal constraints.
8830   for (auto &Entry : SinkAfter) {
8831     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8832     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8833     // If the target is in a replication region, make sure to move Sink to the
8834     // block after it, not into the replication region itself.
8835     if (auto *Region =
8836             dyn_cast_or_null<VPRegionBlock>(Target->getParent()->getParent())) {
8837       if (Region->isReplicator()) {
8838         assert(Region->getNumSuccessors() == 1 && "Expected SESE region!");
8839         VPBasicBlock *NextBlock =
8840             cast<VPBasicBlock>(Region->getSuccessors().front());
8841         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
8842         continue;
8843       }
8844     }
8845     Sink->moveAfter(Target);
8846   }
8847 
8848   // Interleave memory: for each Interleave Group we marked earlier as relevant
8849   // for this VPlan, replace the Recipes widening its memory instructions with a
8850   // single VPInterleaveRecipe at its insertion point.
8851   for (auto IG : InterleaveGroups) {
8852     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8853         RecipeBuilder.getRecipe(IG->getInsertPos()));
8854     SmallVector<VPValue *, 4> StoredValues;
8855     for (unsigned i = 0; i < IG->getFactor(); ++i)
8856       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
8857         StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
8858 
8859     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8860                                         Recipe->getMask());
8861     VPIG->insertBefore(Recipe);
8862     unsigned J = 0;
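    // J indexes the interleave recipe's defined values, which exist only for
    // members that produce a value (i.e. loads).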
8863     for (unsigned i = 0; i < IG->getFactor(); ++i)
8864       if (Instruction *Member = IG->getMember(i)) {
8865         if (!Member->getType()->isVoidTy()) {
8866           VPValue *OriginalV = Plan->getVPValue(Member);
8867           Plan->removeVPValueFor(Member);
8868           Plan->addVPValue(Member, VPIG->getVPValue(J));
8869           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8870           J++;
8871         }
8872         RecipeBuilder.getRecipe(Member)->eraseFromParent();
8873       }
8874   }
8875 
8876   // Adjust the recipes for any inloop reductions.
8877   if (Range.Start.isVector())
8878     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
8879 
8880   // Finally, if tail is folded by masking, introduce selects between the phi
8881   // and the live-out instruction of each reduction, at the end of the latch.
8882   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
8883     Builder.setInsertPoint(VPBB);
8884     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
8885     for (auto &Reduction : Legal->getReductionVars()) {
8886       if (CM.isInLoopReduction(Reduction.first))
8887         continue;
8888       VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
8889       VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
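      // Lanes enabled by the header mask take the new reduction value;
      // masked-off tail lanes keep the value of the phi.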
8890       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
8891     }
8892   }
8893 
8894   std::string PlanName;
8895   raw_string_ostream RSO(PlanName);
8896   ElementCount VF = Range.Start;
8897   Plan->addVF(VF);
8898   RSO << "Initial VPlan for VF={" << VF;
8899   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
8900     Plan->addVF(VF);
8901     RSO << "," << VF;
8902   }
8903   RSO << "},UF>=1";
8904   RSO.flush();
8905   Plan->setName(PlanName);
8906 
8907   return Plan;
8908 }
8909 
8910 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8911   // Outer loop handling: They may require CFG and instruction level
8912   // transformations before even evaluating whether vectorization is profitable.
8913   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8914   // the vectorization pipeline.
8915   assert(!OrigLoop->isInnermost());
8916   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8917 
8918   // Create new empty VPlan
8919   auto Plan = std::make_unique<VPlan>();
8920 
8921   // Build hierarchical CFG
8922   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8923   HCFGBuilder.buildHierarchicalCFG();
8924 
8925   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
8926        VF *= 2)
8927     Plan->addVF(VF);
8928 
8929   if (EnableVPlanPredication) {
8930     VPlanPredicator VPP(*Plan);
8931     VPP.predicate();
8932 
8933     // Avoid running transformation to recipes until masked code generation in
8934     // VPlan-native path is in place.
8935     return Plan;
8936   }
8937 
8938   SmallPtrSet<Instruction *, 1> DeadInstructions;
8939   VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
8940                                              Legal->getInductionVars(),
8941                                              DeadInstructions, *PSE.getSE());
8942   return Plan;
8943 }
8944 
8945 // Adjust the recipes for any inloop reductions. The chain of instructions
8946 // leading from the loop exit instr to the phi need to be converted to
8947 // reductions, with one operand being vector and the other being the scalar
8948 // reduction chain.
8949 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
8950     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
8951   for (auto &Reduction : CM.getInLoopReductionChains()) {
8952     PHINode *Phi = Reduction.first;
8953     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8954     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8955 
8956     // ReductionOperations are ordered top-down, from the phi's use to the
8957     // LoopExitValue. We keep track of the previous item (the Chain) to tell
8958     // which of the two operands will remain scalar and which will be reduced.
8959     // For min/max reductions the chain is the select instruction.
8960     Instruction *Chain = Phi;
8961     for (Instruction *R : ReductionOperations) {
8962       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
8963       RecurKind Kind = RdxDesc.getRecurrenceKind();
8964 
8965       VPValue *ChainOp = Plan->getVPValue(Chain);
8966       unsigned FirstOpId;
8967       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8968         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
8969                "Expected to replace a VPWidenSelectSC");
8970         FirstOpId = 1;
8971       } else {
8972         assert(isa<VPWidenRecipe>(WidenRecipe) &&
8973                "Expected to replace a VPWidenSC");
8974         FirstOpId = 0;
8975       }
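      // Of R's two candidate operands, the one equal to Chain remains the
      // scalar reduction chain; the other one is reduced as a vector.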
8976       unsigned VecOpId =
8977           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
8978       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
8979 
8980       auto *CondOp = CM.foldTailByMasking()
8981                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
8982                          : nullptr;
8983       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
8984           &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
8985       WidenRecipe->getVPValue()->replaceAllUsesWith(RedRecipe);
8986       Plan->removeVPValueFor(R);
8987       Plan->addVPValue(R, RedRecipe);
8988       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
8990       WidenRecipe->eraseFromParent();
8991 
8992       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8993         VPRecipeBase *CompareRecipe =
8994             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
8995         assert(isa<VPWidenRecipe>(CompareRecipe) &&
8996                "Expected to replace a VPWidenSC");
8997         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
8998                "Expected no remaining users");
8999         CompareRecipe->eraseFromParent();
9000       }
9001       Chain = R;
9002     }
9003   }
9004 }
9005 
9006 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9007                                VPSlotTracker &SlotTracker) const {
9008   O << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9009   IG->getInsertPos()->printAsOperand(O, false);
9010   O << ", ";
9011   getAddr()->printAsOperand(O, SlotTracker);
9012   VPValue *Mask = getMask();
9013   if (Mask) {
9014     O << ", ";
9015     Mask->printAsOperand(O, SlotTracker);
9016   }
9017   for (unsigned i = 0; i < IG->getFactor(); ++i)
9018     if (Instruction *I = IG->getMember(i))
9019       O << "\\l\" +\n" << Indent << "\"  " << VPlanIngredient(I) << " " << i;
9020 }
9021 
9022 void VPWidenCallRecipe::execute(VPTransformState &State) {
9023   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9024                                   *this, State);
9025 }
9026 
9027 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9028   State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
9029                                     this, *this, InvariantCond, State);
9030 }
9031 
9032 void VPWidenRecipe::execute(VPTransformState &State) {
9033   State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
9034 }
9035 
9036 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9037   State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
9038                       *this, State.UF, State.VF, IsPtrLoopInvariant,
9039                       IsIndexLoopInvariant, State);
9040 }
9041 
9042 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9043   assert(!State.Instance && "Int or FP induction being replicated.");
9044   State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
9045                                    getTruncInst(), getVPValue(0),
9046                                    getCastValue(), State);
9047 }
9048 
9049 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9050   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc,
9051                                  getStartValue(), this, State);
9052 }
9053 
9054 void VPBlendRecipe::execute(VPTransformState &State) {
9055   State.ILV->setDebugLocFromInst(State.Builder, Phi);
9056   // We know that all PHIs in non-header blocks are converted into
9057   // selects, so we don't have to worry about the insertion order and we
9058   // can just use the builder.
9059   // At this point we generate the predication tree. There may be
9060   // duplications since this is a simple recursive scan, but future
9061   // optimizations will clean it up.
9062 
9063   unsigned NumIncoming = getNumIncomingValues();
9064 
9065   // Generate a sequence of selects of the form:
9066   // SELECT(Mask3, In3,
9067   //        SELECT(Mask2, In2,
9068   //               SELECT(Mask1, In1,
9069   //                      In0)))
9070   // Note that Mask0 is never used: lanes for which no path reaches this phi
9071   // are essentially undef and are taken from In0.
9072   InnerLoopVectorizer::VectorParts Entry(State.UF);
9073   for (unsigned In = 0; In < NumIncoming; ++In) {
9074     for (unsigned Part = 0; Part < State.UF; ++Part) {
9075       // We might have single edge PHIs (blocks) - use an identity
9076       // 'select' for the first PHI operand.
9077       Value *In0 = State.get(getIncomingValue(In), Part);
9078       if (In == 0)
9079         Entry[Part] = In0; // Initialize with the first incoming value.
9080       else {
9081         // Select between the current value and the previous incoming edge
9082         // based on the incoming mask.
9083         Value *Cond = State.get(getMask(In), Part);
9084         Entry[Part] =
9085             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9086       }
9087     }
9088   }
9089   for (unsigned Part = 0; Part < State.UF; ++Part)
9090     State.set(this, Entry[Part], Part);
9091 }
9092 
9093 void VPInterleaveRecipe::execute(VPTransformState &State) {
9094   assert(!State.Instance && "Interleave group being replicated.");
9095   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9096                                       getStoredValues(), getMask());
9097 }
9098 
9099 void VPReductionRecipe::execute(VPTransformState &State) {
9100   assert(!State.Instance && "Reduction being replicated.");
9101   for (unsigned Part = 0; Part < State.UF; ++Part) {
9102     RecurKind Kind = RdxDesc->getRecurrenceKind();
9103     Value *NewVecOp = State.get(getVecOp(), Part);
9104     if (VPValue *Cond = getCondOp()) {
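      // When tail folding, blend masked-off lanes with the reduction identity
      // so they do not change the reduced result.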
9105       Value *NewCond = State.get(Cond, Part);
9106       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9107       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
9108           Kind, VecTy->getElementType());
9109       Constant *IdenVec =
9110           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
9111       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9112       NewVecOp = Select;
9113     }
9114     Value *NewRed =
9115         createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9116     Value *PrevInChain = State.get(getChainOp(), Part);
9117     Value *NextInChain;
9118     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9119       NextInChain =
9120           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9121                          NewRed, PrevInChain);
9122     } else {
9123       NextInChain = State.Builder.CreateBinOp(
9124           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
9125           PrevInChain);
9126     }
9127     State.set(this, NextInChain, Part);
9128   }
9129 }
9130 
9131 void VPReplicateRecipe::execute(VPTransformState &State) {
9132   if (State.Instance) { // Generate a single instance.
9133     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9134     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9135                                     *State.Instance, IsPredicated, State);
9136     // Insert scalar instance packing it into a vector.
9137     if (AlsoPack && State.VF.isVector()) {
9138       // If we're constructing lane 0, initialize to start from poison.
9139       if (State.Instance->Lane.isFirstLane()) {
9140         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9141         Value *Poison = PoisonValue::get(
9142             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9143         State.set(this, Poison, State.Instance->Part);
9144       }
9145       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9146     }
9147     return;
9148   }
9149 
9150   // Generate scalar instances for all VF lanes of all UF parts, unless the
9151   // instruction is uniform, in which case generate only the first lane for
9152   // each of the UF parts.
9153   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9154   assert((!State.VF.isScalable() || IsUniform) &&
9155          "Can't scalarize a scalable vector");
9156   for (unsigned Part = 0; Part < State.UF; ++Part)
9157     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9158       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9159                                       VPIteration(Part, Lane), IsPredicated,
9160                                       State);
9161 }
9162 
9163 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9164   assert(State.Instance && "Branch on Mask works only on single instance.");
9165 
9166   unsigned Part = State.Instance->Part;
9167   unsigned Lane = State.Instance->Lane.getKnownLane();
9168 
9169   Value *ConditionBit = nullptr;
9170   VPValue *BlockInMask = getMask();
9171   if (BlockInMask) {
9172     ConditionBit = State.get(BlockInMask, Part);
9173     if (ConditionBit->getType()->isVectorTy())
9174       ConditionBit = State.Builder.CreateExtractElement(
9175           ConditionBit, State.Builder.getInt32(Lane));
9176   } else // Block in mask is all-one.
9177     ConditionBit = State.Builder.getTrue();
9178 
9179   // Replace the temporary unreachable terminator with a new conditional branch,
9180   // whose two destinations will be set later when they are created.
9181   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9182   assert(isa<UnreachableInst>(CurrentTerminator) &&
9183          "Expected to replace unreachable terminator with conditional branch.");
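  // PrevBB is only a placeholder successor here; the real destinations are
  // filled in once the corresponding basic blocks are created.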
9184   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9185   CondBr->setSuccessor(0, nullptr);
9186   ReplaceInstWithInst(CurrentTerminator, CondBr);
9187 }
9188 
9189 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9190   assert(State.Instance && "Predicated instruction PHI works per instance.");
9191   Instruction *ScalarPredInst =
9192       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9193   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9194   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9195   assert(PredicatingBB && "Predicated block has no single predecessor.");
9196   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9197          "operand must be VPReplicateRecipe");
9198 
9199   // By current pack/unpack logic we need to generate only a single phi node: if
9200   // a vector value for the predicated instruction exists at this point it means
9201   // the instruction has vector users only, and a phi for the vector value is
9202   // needed. In this case the recipe of the predicated instruction is marked to
9203   // also do that packing, thereby "hoisting" the insert-element sequence.
9204   // Otherwise, a phi node for the scalar value is needed.
9205   unsigned Part = State.Instance->Part;
9206   if (State.hasVectorValue(getOperand(0), Part)) {
9207     Value *VectorValue = State.get(getOperand(0), Part);
9208     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9209     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9210     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9211     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9212     if (State.hasVectorValue(this, Part))
9213       State.reset(this, VPhi, Part);
9214     else
9215       State.set(this, VPhi, Part);
9216     // NOTE: Currently we need to update the value of the operand, so the next
9217     // predicated iteration inserts its generated value in the correct vector.
9218     State.reset(getOperand(0), VPhi, Part);
9219   } else {
9220     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9221     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9222     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9223                      PredicatingBB);
9224     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9225     if (State.hasScalarValue(this, *State.Instance))
9226       State.reset(this, Phi, *State.Instance);
9227     else
9228       State.set(this, Phi, *State.Instance);
9229     // NOTE: Currently we need to update the value of the operand, so the next
9230     // predicated iteration inserts its generated value in the correct vector.
9231     State.reset(getOperand(0), Phi, *State.Instance);
9232   }
9233 }
9234 
9235 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9236   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9237   State.ILV->vectorizeMemoryInstruction(&Ingredient, State,
9238                                         StoredValue ? nullptr : getVPValue(),
9239                                         getAddr(), StoredValue, getMask());
9240 }
9241 
9242 // Determine how to lower the scalar epilogue, which depends on 1) optimizing
9243 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9244 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9245 // for predication.
9246 static ScalarEpilogueLowering getScalarEpilogueLowering(
9247     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9248     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9249     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9250     LoopVectorizationLegality &LVL) {
9251   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9252   // don't look at hints or options, and don't request a scalar epilogue.
9253   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9254   // LoopAccessInfo (due to code dependency and not being able to reliably get
9255   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9256   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9257   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9258   // back to the old way and vectorize with versioning when forced. See D81345.)
9259   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9260                                                       PGSOQueryType::IRPass) &&
9261                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9262     return CM_ScalarEpilogueNotAllowedOptSize;
9263 
9264   // 2) If set, obey the directives
9265   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9266     switch (PreferPredicateOverEpilogue) {
9267     case PreferPredicateTy::ScalarEpilogue:
9268       return CM_ScalarEpilogueAllowed;
9269     case PreferPredicateTy::PredicateElseScalarEpilogue:
9270       return CM_ScalarEpilogueNotNeededUsePredicate;
9271     case PreferPredicateTy::PredicateOrDontVectorize:
9272       return CM_ScalarEpilogueNotAllowedUsePredicate;
9273     };
9274   }
9275 
9276   // 3) If set, obey the hints
9277   switch (Hints.getPredicate()) {
9278   case LoopVectorizeHints::FK_Enabled:
9279     return CM_ScalarEpilogueNotNeededUsePredicate;
9280   case LoopVectorizeHints::FK_Disabled:
9281     return CM_ScalarEpilogueAllowed;
9282   };
9283 
9284   // 4) if the TTI hook indicates this is profitable, request predication.
9285   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
9286                                        LVL.getLAI()))
9287     return CM_ScalarEpilogueNotNeededUsePredicate;
9288 
9289   return CM_ScalarEpilogueAllowed;
9290 }
9291 
9292 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
9293   // If Values have been set for this Def, return the one relevant for \p Part.
9294   if (hasVectorValue(Def, Part))
9295     return Data.PerPartOutput[Def][Part];
9296 
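  // A Def with neither a vector nor a scalar value must be a live-in IR value;
  // broadcast it to produce the requested per-part vector value.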
9297   if (!hasScalarValue(Def, {Part, 0})) {
9298     Value *IRV = Def->getLiveInIRValue();
9299     Value *B = ILV->getBroadcastInstrs(IRV);
9300     set(Def, B, Part);
9301     return B;
9302   }
9303 
9304   Value *ScalarValue = get(Def, {Part, 0});
9305   // If we aren't vectorizing, we can just copy the scalar map values over
9306   // to the vector map.
9307   if (VF.isScalar()) {
9308     set(Def, ScalarValue, Part);
9309     return ScalarValue;
9310   }
9311 
9312   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
9313   bool IsUniform = RepR && RepR->isUniform();
9314 
9315   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
9316   // Check if there is a scalar value for the selected lane.
9317   if (!hasScalarValue(Def, {Part, LastLane})) {
9318     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
9319     assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
9320            "unexpected recipe found to be invariant");
9321     IsUniform = true;
9322     LastLane = 0;
9323   }
9324 
9325   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9326 
9327   // Set the insert point after the last scalarized instruction. This
9328   // ensures the insertelement sequence will directly follow the scalar
9329   // definitions.
9330   auto OldIP = Builder.saveIP();
9331   auto NewIP = std::next(BasicBlock::iterator(LastInst));
9332   Builder.SetInsertPoint(&*NewIP);
9333 
9334   // However, if we are vectorizing, we need to construct the vector values.
9335   // If the value is known to be uniform after vectorization, we can just
9336   // broadcast the scalar value corresponding to lane zero for each unroll
9337   // iteration. Otherwise, we construct the vector values using
9338   // insertelement instructions. Since the resulting vectors are stored in
9339   // State, we will only generate the insertelements once.
9340   Value *VectorValue = nullptr;
9341   if (IsUniform) {
9342     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
9343     set(Def, VectorValue, Part);
9344   } else {
9345     // Initialize packing with insertelements to start from poison.
9346     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
9347     Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
9348     set(Def, Poison, Part);
9349     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
9350       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
9351     VectorValue = get(Def, Part);
9352   }
9353   Builder.restoreIP(OldIP);
9354   return VectorValue;
9355 }
9356 
9357 // Process the loop in the VPlan-native vectorization path. This path builds
9358 // VPlan upfront in the vectorization pipeline, which allows to apply
9359 // VPlan-to-VPlan transformations from the very beginning without modifying the
9360 // input LLVM IR.
9361 static bool processLoopInVPlanNativePath(
9362     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9363     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9364     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9365     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9366     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
9367 
9368   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9369     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9370     return false;
9371   }
9372   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9373   Function *F = L->getHeader()->getParent();
9374   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9375 
9376   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9377       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
9378 
9379   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9380                                 &Hints, IAI);
9381   // Use the planner for outer loop vectorization.
9382   // TODO: CM is not used at this point inside the planner. Turn CM into an
9383   // optional argument if we don't need it in the future.
9384   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
9385 
9386   // Get user vectorization factor.
9387   ElementCount UserVF = Hints.getWidth();
9388 
9389   // Plan how to best vectorize, return the best VF and its cost.
9390   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9391 
9392   // If we are stress testing VPlan builds, do not attempt to generate vector
9393   // code. Masked vector code generation support will follow soon.
9394   // Also, do not attempt to vectorize if no vector code will be produced.
9395   if (VPlanBuildStressTest || EnableVPlanPredication ||
9396       VectorizationFactor::Disabled() == VF)
9397     return false;
9398 
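  // Outer-loop vectorization currently uses a fixed interleave count of 1.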
9399   LVP.setBestPlan(VF.Width, 1);
9400 
9401   {
9402     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
9403                              F->getParent()->getDataLayout());
9404     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
9405                            &CM, BFI, PSI, Checks);
9406     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9407                       << L->getHeader()->getParent()->getName() << "\"\n");
9408     LVP.executePlan(LB, DT);
9409   }
9410 
9411   // Mark the loop as already vectorized to avoid vectorizing again.
9412   Hints.setAlreadyVectorized();
9413   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9414   return true;
9415 }
9416 
9417 // Emit a remark if there are stores to floats that required a floating point
9418 // extension. If the vectorized loop was generated with floating point there
9419 // will be a performance penalty from the conversion overhead and the change in
9420 // the vector width.
9421 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9422   SmallVector<Instruction *, 4> Worklist;
9423   for (BasicBlock *BB : L->getBlocks()) {
9424     for (Instruction &Inst : *BB) {
9425       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9426         if (S->getValueOperand()->getType()->isFloatTy())
9427           Worklist.push_back(S);
9428       }
9429     }
9430   }
9431 
9432   // Traverse the floating point stores upwards, searching for floating point
9433   // conversions.
9434   SmallPtrSet<const Instruction *, 4> Visited;
9435   SmallPtrSet<const Instruction *, 4> EmittedRemark;
9436   while (!Worklist.empty()) {
9437     auto *I = Worklist.pop_back_val();
9438     if (!L->contains(I))
9439       continue;
9440     if (!Visited.insert(I).second)
9441       continue;
9442 
9443     // Emit a remark if the floating point store required a floating
9444     // point conversion.
9445     // TODO: More work could be done to identify the root cause such as a
9446     // constant or a function return type and point the user to it.
9447     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9448       ORE->emit([&]() {
9449         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9450                                           I->getDebugLoc(), L->getHeader())
9451                << "floating point conversion changes vector width. "
9452                << "Mixed floating point precision requires an up/down "
9453                << "cast that will negatively impact performance.";
9454       });
9455 
9456     for (Use &Op : I->operands())
9457       if (auto *OpI = dyn_cast<Instruction>(Op))
9458         Worklist.push_back(OpI);
9459   }
9460 }
9461 
9462 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9463     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9464                                !EnableLoopInterleaving),
9465       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9466                               !EnableLoopVectorization) {}
9467 
9468 bool LoopVectorizePass::processLoop(Loop *L) {
9469   assert((EnableVPlanNativePath || L->isInnermost()) &&
9470          "VPlan-native path is not enabled. Only process inner loops.");
9471 
9472 #ifndef NDEBUG
9473   const std::string DebugLocStr = getDebugLocString(L);
9474 #endif /* NDEBUG */
9475 
9476   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
9477                     << L->getHeader()->getParent()->getName() << "\" from "
9478                     << DebugLocStr << "\n");
9479 
9480   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
9481 
9482   LLVM_DEBUG(
9483       dbgs() << "LV: Loop hints:"
9484              << " force="
9485              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9486                      ? "disabled"
9487                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9488                             ? "enabled"
9489                             : "?"))
9490              << " width=" << Hints.getWidth()
9491              << " unroll=" << Hints.getInterleave() << "\n");
9492 
9493   // Function containing loop
9494   Function *F = L->getHeader()->getParent();
9495 
9496   // Looking at the diagnostic output is the only way to determine if a loop
9497   // was vectorized (other than looking at the IR or machine code), so it
9498   // is important to generate an optimization remark for each loop. Most of
9499   // these messages are generated as OptimizationRemarkAnalysis. Remarks
9500   // generated as OptimizationRemark and OptimizationRemarkMissed are
9501   // less verbose reporting vectorized loops and unvectorized loops that may
9502   // less verbose, reporting vectorized loops and unvectorized loops that may
9503 
9504   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9505     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9506     return false;
9507   }
9508 
9509   PredicatedScalarEvolution PSE(*SE, *L);
9510 
9511   // Check if it is legal to vectorize the loop.
9512   LoopVectorizationRequirements Requirements(*ORE);
9513   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
9514                                 &Requirements, &Hints, DB, AC, BFI, PSI);
9515   if (!LVL.canVectorize(EnableVPlanNativePath)) {
9516     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9517     Hints.emitRemarkWithHints();
9518     return false;
9519   }
9520 
9521   // Check the function attributes and profiles to find out if this function
9522   // should be optimized for size.
9523   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9524       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
9525 
9526   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9527   // here. They may require CFG and instruction level transformations before
9528   // even evaluating whether vectorization is profitable. Since we cannot modify
9529   // the incoming IR, we need to build VPlan upfront in the vectorization
9530   // pipeline.
9531   if (!L->isInnermost())
9532     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9533                                         ORE, BFI, PSI, Hints);
9534 
9535   assert(L->isInnermost() && "Inner loop expected.");
9536 
9537   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9538   // count by optimizing for size, to minimize overheads.
9539   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9540   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9541     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9542                       << "This loop is worth vectorizing only if no scalar "
9543                       << "iteration overheads are incurred.");
9544     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9545       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9546     else {
9547       LLVM_DEBUG(dbgs() << "\n");
9548       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9549     }
9550   }
9551 
9552   // Check the function attributes to see if implicit floats are allowed.
9553   // FIXME: This check doesn't seem possibly correct -- what if the loop is
9554   // an integer loop and the vector instructions selected are purely integer
9555   // vector instructions?
9556   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9557     reportVectorizationFailure(
9558         "Can't vectorize when the NoImplicitFloat attribute is used",
9559         "loop not vectorized due to NoImplicitFloat attribute",
9560         "NoImplicitFloat", ORE, L);
9561     Hints.emitRemarkWithHints();
9562     return false;
9563   }
9564 
9565   // Check if the target supports potentially unsafe FP vectorization.
9566   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9567   // for the target we're vectorizing for, to make sure none of the
9568   // additional fp-math flags can help.
9569   if (Hints.isPotentiallyUnsafe() &&
9570       TTI->isFPVectorizationPotentiallyUnsafe()) {
9571     reportVectorizationFailure(
9572         "Potentially unsafe FP op prevents vectorization",
9573         "loop not vectorized due to unsafe FP support.",
9574         "UnsafeFP", ORE, L);
9575     Hints.emitRemarkWithHints();
9576     return false;
9577   }
9578 
9579   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9580   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9581 
9582   // If an override option has been passed in for interleaved accesses, use it.
9583   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9584     UseInterleaved = EnableInterleavedMemAccesses;
9585 
9586   // Analyze interleaved memory accesses.
9587   if (UseInterleaved) {
9588     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9589   }
9590 
9591   // Use the cost model.
9592   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9593                                 F, &Hints, IAI);
9594   CM.collectValuesToIgnore();
9595 
9596   // Use the planner for vectorization.
9597   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
9598 
9599   // Get user vectorization factor and interleave count.
9600   ElementCount UserVF = Hints.getWidth();
9601   unsigned UserIC = Hints.getInterleave();
9602 
9603   // Plan how to best vectorize, return the best VF and its cost.
9604   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9605 
9606   VectorizationFactor VF = VectorizationFactor::Disabled();
9607   unsigned IC = 1;
9608 
9609   if (MaybeVF) {
9610     VF = *MaybeVF;
9611     // Select the interleave count.
9612     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
9613   }
9614 
9615   // Identify the diagnostic messages that should be produced.
9616   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9617   bool VectorizeLoop = true, InterleaveLoop = true;
9618   if (Requirements.doesNotMeet(F, L, Hints)) {
9619     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
9620                          "requirements.\n");
9621     Hints.emitRemarkWithHints();
9622     return false;
9623   }
9624 
9625   if (VF.Width.isScalar()) {
9626     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9627     VecDiagMsg = std::make_pair(
9628         "VectorizationNotBeneficial",
9629         "the cost-model indicates that vectorization is not beneficial");
9630     VectorizeLoop = false;
9631   }
9632 
9633   if (!MaybeVF && UserIC > 1) {
9634     // Tell the user interleaving was avoided up-front, despite being explicitly
9635     // requested.
9636     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9637                          "interleaving should be avoided up front\n");
9638     IntDiagMsg = std::make_pair(
9639         "InterleavingAvoided",
9640         "Ignoring UserIC, because interleaving was avoided up front");
9641     InterleaveLoop = false;
9642   } else if (IC == 1 && UserIC <= 1) {
9643     // Tell the user interleaving is not beneficial.
9644     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9645     IntDiagMsg = std::make_pair(
9646         "InterleavingNotBeneficial",
9647         "the cost-model indicates that interleaving is not beneficial");
9648     InterleaveLoop = false;
9649     if (UserIC == 1) {
9650       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9651       IntDiagMsg.second +=
9652           " and is explicitly disabled or interleave count is set to 1";
9653     }
9654   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
9656     LLVM_DEBUG(
9657         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9658     IntDiagMsg = std::make_pair(
9659         "InterleavingBeneficialButDisabled",
9660         "the cost-model indicates that interleaving is beneficial "
9661         "but is explicitly disabled or interleave count is set to 1");
9662     InterleaveLoop = false;
9663   }
9664 
9665   // Override IC if user provided an interleave count.
9666   IC = UserIC > 0 ? UserIC : IC;
9667 
9668   // Emit diagnostic messages, if any.
9669   const char *VAPassName = Hints.vectorizeAnalysisPassName();
9670   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
9672     ORE->emit([&]() {
9673       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9674                                       L->getStartLoc(), L->getHeader())
9675              << VecDiagMsg.second;
9676     });
9677     ORE->emit([&]() {
9678       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9679                                       L->getStartLoc(), L->getHeader())
9680              << IntDiagMsg.second;
9681     });
9682     return false;
9683   } else if (!VectorizeLoop && InterleaveLoop) {
9684     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9685     ORE->emit([&]() {
9686       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9687                                         L->getStartLoc(), L->getHeader())
9688              << VecDiagMsg.second;
9689     });
9690   } else if (VectorizeLoop && !InterleaveLoop) {
9691     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9692                       << ") in " << DebugLocStr << '\n');
9693     ORE->emit([&]() {
9694       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9695                                         L->getStartLoc(), L->getHeader())
9696              << IntDiagMsg.second;
9697     });
9698   } else if (VectorizeLoop && InterleaveLoop) {
9699     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9700                       << ") in " << DebugLocStr << '\n');
9701     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9702   }
9703 
9704   bool DisableRuntimeUnroll = false;
9705   MDNode *OrigLoopID = L->getLoopID();
9706   {
    // Optimistically generate runtime checks. Drop them if they turn out not
    // to be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
9710     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
9711                              F->getParent()->getDataLayout());
9712     if (!VF.Width.isScalar() || IC > 1)
9713       Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
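    // For illustration: the generated checks typically consist of SCEV
    // predicate guards (e.g. no-overflow assumptions) and pairwise overlap
    // checks between the address ranges accessed through pointers that could
    // not be disambiguated statically; if any check fails at run time, the
    // scalar loop is executed instead.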
9714     LVP.setBestPlan(VF.Width, IC);
9715 
9716     using namespace ore;
9717     if (!VectorizeLoop) {
9718       assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided not to vectorize the loop (the selected VF is scalar),
      // interleave it instead.
9721       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
9722                                  &CM, BFI, PSI, Checks);
9723       LVP.executePlan(Unroller, DT);
9724 
9725       ORE->emit([&]() {
9726         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9727                                   L->getHeader())
9728                << "interleaved loop (interleaved count: "
9729                << NV("InterleaveCount", IC) << ")";
9730       });
9731     } else {
      // If we decided that it is profitable to vectorize the loop, then do it.
9733 
9734       // Consider vectorizing the epilogue too if it's profitable.
9735       VectorizationFactor EpilogueVF =
9736           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
9737       if (EpilogueVF.Width.isVector()) {
9738 
9739         // The first pass vectorizes the main loop and creates a scalar epilogue
9740         // to be vectorized by executing the plan (potentially with a different
9741         // factor) again shortly afterwards.
9742         EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
9743                                           EpilogueVF.Width.getKnownMinValue(),
9744                                           1);
9745         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
9746                                            EPI, &LVL, &CM, BFI, PSI, Checks);
9747 
9748         LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
9749         LVP.executePlan(MainILV, DT);
9750         ++LoopsVectorized;
9751 
9752         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9753         formLCSSARecursively(*L, *DT, LI, SE);
9754 
9755         // Second pass vectorizes the epilogue and adjusts the control flow
9756         // edges from the first pass.
9757         LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
9758         EPI.MainLoopVF = EPI.EpilogueVF;
9759         EPI.MainLoopUF = EPI.EpilogueUF;
9760         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
9761                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
9762                                                  Checks);
9763         LVP.executePlan(EpilogILV, DT);
9764         ++LoopsEpilogueVectorized;
9765 
9766         if (!MainILV.areSafetyChecksAdded())
9767           DisableRuntimeUnroll = true;
9768       } else {
9769         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
9770                                &LVL, &CM, BFI, PSI, Checks);
9771         LVP.executePlan(LB, DT);
9772         ++LoopsVectorized;
9773 
        // Add metadata to disable runtime unrolling of the scalar loop when
        // there are no runtime checks for strides and memory. A scalar loop
        // that is rarely run is not worth unrolling.
9777         if (!LB.areSafetyChecksAdded())
9778           DisableRuntimeUnroll = true;
9779       }
9780       // Report the vectorization decision.
9781       ORE->emit([&]() {
9782         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
9783                                   L->getHeader())
9784                << "vectorized loop (vectorization width: "
9785                << NV("VectorizationFactor", VF.Width)
9786                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
9787       });
9788     }
9789 
9790     if (ORE->allowExtraAnalysis(LV_NAME))
9791       checkMixedPrecision(L, ORE);
9792   }
9793 
9794   Optional<MDNode *> RemainderLoopID =
9795       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
9796                                       LLVMLoopVectorizeFollowupEpilogue});
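  // For illustration: a follow-up attribute attached to the original loop ID,
  // e.g.
  //
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"llvm.loop.vectorize.followup_epilogue", !2}
  //   !2 = !{!"llvm.loop.unroll.disable"}
  //
  // is transferred to the remainder loop below; otherwise the remainder loop
  // is marked as already vectorized (and runtime unrolling may be disabled).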
9797   if (RemainderLoopID.hasValue()) {
9798     L->setLoopID(RemainderLoopID.getValue());
9799   } else {
9800     if (DisableRuntimeUnroll)
9801       AddRuntimeUnrollDisableMetaData(L);
9802 
9803     // Mark the loop as already vectorized to avoid vectorizing again.
9804     Hints.setAlreadyVectorized();
9805   }
9806 
9807   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9808   return true;
9809 }
9810 
9811 LoopVectorizeResult LoopVectorizePass::runImpl(
9812     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
9813     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
9814     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
9815     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
9816     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
9817   SE = &SE_;
9818   LI = &LI_;
9819   TTI = &TTI_;
9820   DT = &DT_;
9821   BFI = &BFI_;
9822   TLI = TLI_;
9823   AA = &AA_;
9824   AC = &AC_;
9825   GetLAA = &GetLAA_;
9826   DB = &DB_;
9827   ORE = &ORE_;
9828   PSI = PSI_;
9829 
  // Don't attempt vectorization if
9831   // 1. the target claims to have no vector registers, and
9832   // 2. interleaving won't help ILP.
9833   //
9834   // The second condition is necessary because, even if the target has no
9835   // vector registers, loop vectorization may still enable scalar
9836   // interleaving.
9837   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
9838       TTI->getMaxInterleaveFactor(1) < 2)
9839     return LoopVectorizeResult(false, false);
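  // For illustration: getRegisterClassForType(true) queries the register class
  // used for vector types, so a target that reports zero such registers is
  // skipped unless getMaxInterleaveFactor(1) indicates that scalar
  // interleaving (a factor of 2 or more) can still improve ILP.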
9840 
9841   bool Changed = false, CFGChanged = false;
9842 
  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
9848   for (auto &L : *LI)
9849     Changed |= CFGChanged |=
9850         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
9851 
9852   // Build up a worklist of inner-loops to vectorize. This is necessary as
9853   // the act of vectorizing or partially unrolling a loop creates new loops
9854   // and can invalidate iterators across the loops.
9855   SmallVector<Loop *, 8> Worklist;
9856 
9857   for (Loop *L : *LI)
9858     collectSupportedLoops(*L, LI, ORE, Worklist);
9859 
9860   LoopsAnalyzed += Worklist.size();
9861 
9862   // Now walk the identified inner loops.
9863   while (!Worklist.empty()) {
9864     Loop *L = Worklist.pop_back_val();
9865 
9866     // For the inner loops we actually process, form LCSSA to simplify the
9867     // transform.
9868     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
9869 
9870     Changed |= CFGChanged |= processLoop(L);
9871   }
9872 
9873   // Process each loop nest in the function.
9874   return LoopVectorizeResult(Changed, CFGChanged);
9875 }
9876 
9877 PreservedAnalyses LoopVectorizePass::run(Function &F,
9878                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  MemorySSA *MSSA = EnableMSSALoopDependency
                        ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
                        : nullptr;

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
                                      TLI, TTI, nullptr, MSSA};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  PA.preserve<BasicAA>();
  PA.preserve<GlobalsAA>();
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
9922 }
9923