1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
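//
// For example (an illustrative sketch, assuming VF = 4 and a trip count that
// is a multiple of 4), a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
//
// is conceptually rewritten so that each iteration computes four elements of
// A at once and the induction variable advances by 4 instead of by 1.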
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to
29 // the VPlan infrastructure and to introduce outer loop vectorization support
30 // (see docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanTransforms.h"
62 #include "llvm/ADT/APInt.h"
63 #include "llvm/ADT/ArrayRef.h"
64 #include "llvm/ADT/DenseMap.h"
65 #include "llvm/ADT/DenseMapInfo.h"
66 #include "llvm/ADT/Hashing.h"
67 #include "llvm/ADT/MapVector.h"
68 #include "llvm/ADT/STLExtras.h"
69 #include "llvm/ADT/SmallPtrSet.h"
70 #include "llvm/ADT/SmallSet.h"
71 #include "llvm/ADT/SmallVector.h"
72 #include "llvm/ADT/Statistic.h"
73 #include "llvm/ADT/StringRef.h"
74 #include "llvm/ADT/Twine.h"
75 #include "llvm/ADT/iterator_range.h"
76 #include "llvm/Analysis/AssumptionCache.h"
77 #include "llvm/Analysis/BasicAliasAnalysis.h"
78 #include "llvm/Analysis/BlockFrequencyInfo.h"
79 #include "llvm/Analysis/CFG.h"
80 #include "llvm/Analysis/CodeMetrics.h"
81 #include "llvm/Analysis/DemandedBits.h"
82 #include "llvm/Analysis/GlobalsModRef.h"
83 #include "llvm/Analysis/LoopAccessAnalysis.h"
84 #include "llvm/Analysis/LoopAnalysisManager.h"
85 #include "llvm/Analysis/LoopInfo.h"
86 #include "llvm/Analysis/LoopIterator.h"
87 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
88 #include "llvm/Analysis/ProfileSummaryInfo.h"
89 #include "llvm/Analysis/ScalarEvolution.h"
90 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
91 #include "llvm/Analysis/TargetLibraryInfo.h"
92 #include "llvm/Analysis/TargetTransformInfo.h"
93 #include "llvm/Analysis/ValueTracking.h"
94 #include "llvm/Analysis/VectorUtils.h"
95 #include "llvm/IR/Attributes.h"
96 #include "llvm/IR/BasicBlock.h"
97 #include "llvm/IR/CFG.h"
98 #include "llvm/IR/Constant.h"
99 #include "llvm/IR/Constants.h"
100 #include "llvm/IR/DataLayout.h"
101 #include "llvm/IR/DebugInfoMetadata.h"
102 #include "llvm/IR/DebugLoc.h"
103 #include "llvm/IR/DerivedTypes.h"
104 #include "llvm/IR/DiagnosticInfo.h"
105 #include "llvm/IR/Dominators.h"
106 #include "llvm/IR/Function.h"
107 #include "llvm/IR/IRBuilder.h"
108 #include "llvm/IR/InstrTypes.h"
109 #include "llvm/IR/Instruction.h"
110 #include "llvm/IR/Instructions.h"
111 #include "llvm/IR/IntrinsicInst.h"
112 #include "llvm/IR/Intrinsics.h"
113 #include "llvm/IR/Metadata.h"
114 #include "llvm/IR/Module.h"
115 #include "llvm/IR/Operator.h"
116 #include "llvm/IR/PatternMatch.h"
117 #include "llvm/IR/Type.h"
118 #include "llvm/IR/Use.h"
119 #include "llvm/IR/User.h"
120 #include "llvm/IR/Value.h"
121 #include "llvm/IR/ValueHandle.h"
122 #include "llvm/IR/Verifier.h"
123 #include "llvm/InitializePasses.h"
124 #include "llvm/Pass.h"
125 #include "llvm/Support/Casting.h"
126 #include "llvm/Support/CommandLine.h"
127 #include "llvm/Support/Compiler.h"
128 #include "llvm/Support/Debug.h"
129 #include "llvm/Support/ErrorHandling.h"
130 #include "llvm/Support/InstructionCost.h"
131 #include "llvm/Support/MathExtras.h"
132 #include "llvm/Support/raw_ostream.h"
133 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
134 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
135 #include "llvm/Transforms/Utils/LoopSimplify.h"
136 #include "llvm/Transforms/Utils/LoopUtils.h"
137 #include "llvm/Transforms/Utils/LoopVersioning.h"
138 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
139 #include "llvm/Transforms/Utils/SizeOpts.h"
140 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
141 #include <algorithm>
142 #include <cassert>
143 #include <cmath>
144 #include <cstdint>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <map>
149 #include <memory>
150 #include <string>
151 #include <tuple>
152 #include <utility>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 #ifndef NDEBUG
160 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
161 #endif
162 
163 /// @{
164 /// Metadata attribute names
165 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
166 const char LLVMLoopVectorizeFollowupVectorized[] =
167     "llvm.loop.vectorize.followup_vectorized";
168 const char LLVMLoopVectorizeFollowupEpilogue[] =
169     "llvm.loop.vectorize.followup_epilogue";
170 /// @}
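
// These strings name the follow-up attribute groups that the vectorizer looks
// up in the original loop's "llvm.loop" metadata and attaches to the loops it
// creates. A rough, illustrative sketch (the metadata node numbering is
// hypothetical):
//
//   br i1 %cond, label %header, label %exit, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.vectorize.followup_vectorized", !2}
//   !2 = !{!"llvm.loop.isvectorized", i32 1}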
171 
172 STATISTIC(LoopsVectorized, "Number of loops vectorized");
173 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
174 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
175 
176 static cl::opt<bool> EnableEpilogueVectorization(
177     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
178     cl::desc("Enable vectorization of epilogue loops."));
179 
180 static cl::opt<unsigned> EpilogueVectorizationForceVF(
181     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
182     cl::desc("When epilogue vectorization is enabled, and a value greater than "
183              "1 is specified, forces the given VF for all applicable epilogue "
184              "loops."));
185 
186 static cl::opt<unsigned> EpilogueVectorizationMinVF(
187     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
188     cl::desc("Only loops with vectorization factor equal to or larger than "
189              "the specified value are considered for epilogue vectorization."));
190 
191 /// Loops with a known constant trip count below this number are vectorized only
192 /// if no scalar iteration overheads are incurred.
193 static cl::opt<unsigned> TinyTripCountVectorThreshold(
194     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
195     cl::desc("Loops with a constant trip count that is smaller than this "
196              "value are vectorized only if no scalar iteration overheads "
197              "are incurred."));
198 
199 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
200     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
201     cl::desc("The maximum allowed number of runtime memory checks"));
202 
203 // Option prefer-predicate-over-epilogue indicates that an epilogue is
204 // undesired and that predication is preferred. I.e., the vectorizer will try
205 // to fold the tail loop (epilogue) into the vector body and predicate the
206 // instructions accordingly. If tail-folding fails, the values below select
207 // the fallback strategy:
208 namespace PreferPredicateTy {
209   enum Option {
210     ScalarEpilogue = 0,
211     PredicateElseScalarEpilogue,
212     PredicateOrDontVectorize
213   };
214 } // namespace PreferPredicateTy
215 
216 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
217     "prefer-predicate-over-epilogue",
218     cl::init(PreferPredicateTy::ScalarEpilogue),
219     cl::Hidden,
220     cl::desc("Tail-folding and predication preferences over creating a scalar "
221              "epilogue loop."),
222     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
223                           "scalar-epilogue",
224                           "Don't tail-predicate loops, create scalar epilogue"),
225                clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
226                           "predicate-else-scalar-epilogue",
227                           "Prefer tail-folding, create scalar epilogue if "
228                           "tail-folding fails."),
229                clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
230                           "predicate-dont-vectorize",
231                           "Prefer tail-folding, don't attempt vectorization if "
232                           "tail-folding fails.")));
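
// For example, tail-folding with a scalar-epilogue fallback can be requested
// with an invocation along the lines of (illustrative):
//
//   opt -passes=loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue ...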
233 
234 static cl::opt<bool> MaximizeBandwidth(
235     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
236     cl::desc("Maximize bandwidth when selecting the vectorization factor, "
237              "which will be determined by the smallest type in the loop."));
238 
239 static cl::opt<bool> EnableInterleavedMemAccesses(
240     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
241     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
242 
243 /// An interleave-group may need masking if it resides in a block that needs
244 /// predication, or in order to mask away gaps.
245 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
246     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
247     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
248 
249 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
250     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
251     cl::desc("We don't interleave loops with an estimated constant trip count "
252              "below this number"));
253 
254 static cl::opt<unsigned> ForceTargetNumScalarRegs(
255     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
256     cl::desc("A flag that overrides the target's number of scalar registers."));
257 
258 static cl::opt<unsigned> ForceTargetNumVectorRegs(
259     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
260     cl::desc("A flag that overrides the target's number of vector registers."));
261 
262 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
263     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
264     cl::desc("A flag that overrides the target's max interleave factor for "
265              "scalar loops."));
266 
267 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
268     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
269     cl::desc("A flag that overrides the target's max interleave factor for "
270              "vectorized loops."));
271 
272 static cl::opt<unsigned> ForceTargetInstructionCost(
273     "force-target-instruction-cost", cl::init(0), cl::Hidden,
274     cl::desc("A flag that overrides the target's expected cost for "
275              "an instruction to a single constant value. Mostly "
276              "useful for getting consistent testing."));
277 
278 static cl::opt<bool> ForceTargetSupportsScalableVectors(
279     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
280     cl::desc(
281         "Pretend that scalable vectors are supported, even if the target does "
282         "not support them. This flag should only be used for testing."));
283 
284 static cl::opt<unsigned> SmallLoopCost(
285     "small-loop-cost", cl::init(20), cl::Hidden,
286     cl::desc(
287         "The cost of a loop that is considered 'small' by the interleaver."));
288 
289 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
290     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
291     cl::desc("Enable the use of the block frequency analysis to access PGO "
292              "heuristics minimizing code growth in cold regions and being more "
293              "aggressive in hot regions."));
294 
295 // Runtime interleave loops for load/store throughput.
296 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
297     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
298     cl::desc(
299         "Enable runtime interleaving until load/store ports are saturated"));
300 
301 /// Interleave small loops with scalar reductions.
302 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
303     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
304     cl::desc("Enable interleaving for loops with small iteration counts that "
305              "contain scalar reductions to expose ILP."));
306 
307 /// The number of stores in a loop that are allowed to need predication.
308 static cl::opt<unsigned> NumberOfStoresToPredicate(
309     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
310     cl::desc("Max number of stores to be predicated behind an if."));
311 
312 static cl::opt<bool> EnableIndVarRegisterHeur(
313     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
314     cl::desc("Count the induction variable only once when interleaving"));
315 
316 static cl::opt<bool> EnableCondStoresVectorization(
317     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
318     cl::desc("Enable if predication of stores during vectorization."));
319 
320 static cl::opt<unsigned> MaxNestedScalarReductionIC(
321     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
322     cl::desc("The maximum interleave count to use when interleaving a scalar "
323              "reduction in a nested loop."));
324 
325 static cl::opt<bool>
326     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
327                            cl::Hidden,
328                            cl::desc("Prefer in-loop vector reductions, "
329                                     "overriding the target's preference."));
330 
331 static cl::opt<bool> ForceOrderedReductions(
332     "force-ordered-reductions", cl::init(false), cl::Hidden,
333     cl::desc("Enable the vectorization of loops with in-order (strict) "
334              "FP reductions"));
335 
336 static cl::opt<bool> PreferPredicatedReductionSelect(
337     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
338     cl::desc(
339         "Prefer predicating a reduction operation over an after loop select."));
340 
341 cl::opt<bool> EnableVPlanNativePath(
342     "enable-vplan-native-path", cl::init(false), cl::Hidden,
343     cl::desc("Enable VPlan-native vectorization path with "
344              "support for outer loop vectorization."));
345 
346 // This flag enables the stress testing of the VPlan H-CFG construction in the
347 // VPlan-native vectorization path. It must be used in conjunction with
348 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
349 // verification of the H-CFGs built.
350 static cl::opt<bool> VPlanBuildStressTest(
351     "vplan-build-stress-test", cl::init(false), cl::Hidden,
352     cl::desc(
353         "Build VPlan for every supported loop nest in the function and bail "
354         "out right after the build (stress test the VPlan H-CFG construction "
355         "in the VPlan-native vectorization path)."));
356 
357 cl::opt<bool> llvm::EnableLoopInterleaving(
358     "interleave-loops", cl::init(true), cl::Hidden,
359     cl::desc("Enable loop interleaving in Loop vectorization passes"));
360 cl::opt<bool> llvm::EnableLoopVectorization(
361     "vectorize-loops", cl::init(true), cl::Hidden,
362     cl::desc("Run the Loop vectorization passes"));
363 
364 static cl::opt<bool> PrintVPlansInDotFormat(
365     "vplan-print-in-dot-format", cl::Hidden,
366     cl::desc("Use dot format instead of plain text when dumping VPlans"));
367 
368 static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
369     "force-widen-divrem-via-safe-divisor", cl::Hidden,
370     cl::desc(
371         "Override cost based safe divisor widening for div/rem instructions"));
372 
373 /// A helper function that returns true if the given type is irregular. The
374 /// type is irregular if its allocated size doesn't equal the store size of an
375 /// element of the corresponding vector type.
376 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
377   // Determine if an array of N elements of type Ty is "bitcast compatible"
378   // with a <N x Ty> vector.
379   // This is only true if there is no padding between the array elements.
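  // For example, on typical x86 data layouts x86_fp80 has a type size of 80
  // bits but an alloc size of 96 or 128 bits, so it is treated as irregular.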
380   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
381 }
382 
383 /// A helper function that returns the reciprocal of the block probability of
384 /// predicated blocks. If we return X, we are assuming the predicated block
385 /// will execute once for every X iterations of the loop header.
386 ///
387 /// TODO: We should use actual block probability here, if available. Currently,
388 ///       we always assume predicated blocks have a 50% chance of executing.
389 static unsigned getReciprocalPredBlockProb() { return 2; }
390 
391 /// A helper function that returns an integer or floating-point constant with
392 /// value C.
393 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
394   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
395                            : ConstantFP::get(Ty, C);
396 }
397 
398 /// Returns "best known" trip count for the specified loop \p L as defined by
399 /// the following procedure:
400 ///   1) Returns exact trip count if it is known.
401 ///   2) Returns expected trip count according to profile data if any.
402 ///   3) Returns upper bound estimate if it is known.
403 ///   4) Returns std::nullopt if all of the above failed.
404 static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
405                                                    Loop *L) {
406   // Check if exact trip count is known.
407   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
408     return ExpectedTC;
409 
410   // Check if there is an expected trip count available from profile data.
411   if (LoopVectorizeWithBlockFrequency)
412     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
413       return *EstimatedTC;
414 
415   // Check if upper bound estimate is known.
416   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
417     return ExpectedTC;
418 
419   return std::nullopt;
420 }
421 
422 namespace {
423 // Forward declare GeneratedRTChecks.
424 class GeneratedRTChecks;
425 } // namespace
426 
427 namespace llvm {
428 
429 AnalysisKey ShouldRunExtraVectorPasses::Key;
430 
431 /// InnerLoopVectorizer vectorizes loops which contain only one basic
432 /// block to a specified vectorization factor (VF).
433 /// This class performs the widening of scalars into vectors, or multiple
434 /// scalars. This class also implements the following features:
435 /// * It inserts an epilogue loop for handling loops that don't have iteration
436 ///   counts that are known to be a multiple of the vectorization factor.
437 /// * It handles the code generation for reduction variables.
438 /// * Scalarization (implementation using scalars) of un-vectorizable
439 ///   instructions.
440 /// InnerLoopVectorizer does not perform any vectorization-legality
441 /// checks, and relies on the caller to check for the different legality
442 /// aspects. The InnerLoopVectorizer relies on the
443 /// LoopVectorizationLegality class to provide information about the induction
444 /// and reduction variables that were found to a given vectorization factor.
445 class InnerLoopVectorizer {
446 public:
447   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
448                       LoopInfo *LI, DominatorTree *DT,
449                       const TargetLibraryInfo *TLI,
450                       const TargetTransformInfo *TTI, AssumptionCache *AC,
451                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
452                       ElementCount MinProfitableTripCount,
453                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
454                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
455                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
456       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
457         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
458         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
459         PSI(PSI), RTChecks(RTChecks) {
460     // Query this against the original loop and save it here because the profile
461     // of the original loop header may change as the transformation happens.
462     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
463         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
464 
465     if (MinProfitableTripCount.isZero())
466       this->MinProfitableTripCount = VecWidth;
467     else
468       this->MinProfitableTripCount = MinProfitableTripCount;
469   }
470 
471   virtual ~InnerLoopVectorizer() = default;
472 
473   /// Create a new empty loop that will contain vectorized instructions later
474   /// on, while the old loop will be used as the scalar remainder. Control flow
475   /// is generated around the vectorized (and scalar epilogue) loops consisting
476   /// of various checks and bypasses. Return the pre-header block of the new
477   /// loop and the start value for the canonical induction, if it is != 0. The
478   /// latter is the case when vectorizing the epilogue loop. In the case of
479   /// epilogue vectorization, this function is overridden to handle the more
480   /// complex control flow around the loops.
481   virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
482 
483   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
484   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
485 
486   // Return true if any runtime check is added.
487   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
488 
489   /// A type for vectorized values in the new loop. Each value from the
490   /// original loop, when vectorized, is represented by UF vector values in the
491   /// new unrolled loop, where UF is the unroll factor.
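  /// For example, with UF = 2 and VF = 4, an i32 value from the original loop
  /// is represented by two <4 x i32> values in the new loop.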
492   using VectorParts = SmallVector<Value *, 2>;
493 
494   /// A helper function to scalarize a single Instruction in the innermost loop.
495   /// Generates a sequence of scalar instances for each lane between \p MinLane
496   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
497   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
498   /// Instr's operands.
499   void scalarizeInstruction(const Instruction *Instr,
500                             VPReplicateRecipe *RepRecipe,
501                             const VPIteration &Instance, bool IfPredicateInstr,
502                             VPTransformState &State);
503 
504   /// Construct the vector value of a scalarized value \p V one lane at a time.
505   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
506                                  VPTransformState &State);
507 
508   /// Try to vectorize interleaved access group \p Group with the base address
509   /// given in \p Addr, optionally masking the vector operations if \p
510   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
511   /// values in the vectorized loop.
512   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
513                                 ArrayRef<VPValue *> VPDefs,
514                                 VPTransformState &State, VPValue *Addr,
515                                 ArrayRef<VPValue *> StoredValues,
516                                 VPValue *BlockInMask = nullptr);
517 
518   /// Fix the non-induction PHIs in \p Plan.
519   void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
520 
521   /// Returns true if the reordering of FP operations is not allowed, but we are
522   /// able to vectorize with strict in-order reductions for the given RdxDesc.
523   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
524 
525   /// Create a broadcast instruction. This method generates a broadcast
526   /// instruction (shuffle) for loop invariant values and for the induction
527   /// value. If this is the induction variable then we extend it to N, N+1, ...
528   /// this is needed because each iteration in the loop corresponds to a SIMD
529   /// element.
530   virtual Value *getBroadcastInstrs(Value *V);
531 
532   // Returns the resume value (bc.merge.rdx) for a reduction as
533   // generated by fixReduction.
534   PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
535 
536   /// Create a new phi node for the induction variable \p OrigPhi to resume
537   /// iteration count in the scalar epilogue, from where the vectorized loop
538   /// left off. In cases where the loop skeleton is more complicated (e.g.
539   /// epilogue vectorization) and the resume values can come from an additional
540   /// bypass block, the \p AdditionalBypass pair provides information about the
541   /// bypass block and the end value on the edge from bypass to this loop.
542   PHINode *createInductionResumeValue(
543       PHINode *OrigPhi, const InductionDescriptor &ID,
544       ArrayRef<BasicBlock *> BypassBlocks,
545       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
546 
547 protected:
548   friend class LoopVectorizationPlanner;
549 
550   /// A small list of PHINodes.
551   using PhiVector = SmallVector<PHINode *, 4>;
552 
553   /// A type for scalarized values in the new loop. Each value from the
554   /// original loop, when scalarized, is represented by UF x VF scalar values
555   /// in the new unrolled loop, where UF is the unroll factor and VF is the
556   /// vectorization factor.
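  /// For example, with UF = 2 and VF = 4, a scalarized value is represented by
  /// two groups of four scalar values, one per (part, lane) pair.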
557   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
558 
559   /// Set up the values of the IVs correctly when exiting the vector loop.
560   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
561                     Value *VectorTripCount, Value *EndValue,
562                     BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
563                     VPlan &Plan);
564 
565   /// Handle all cross-iteration phis in the header.
566   void fixCrossIterationPHIs(VPTransformState &State);
567 
568   /// Create the exit value of first order recurrences in the middle block and
569   /// update their users.
570   void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
571                                VPTransformState &State);
572 
573   /// Create code for the loop exit value of the reduction.
574   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
575 
576   /// Clear NSW/NUW flags from reduction instructions if necessary.
577   void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
578                                VPTransformState &State);
579 
580   /// Iteratively sink the scalarized operands of a predicated instruction into
581   /// the block that was created for it.
582   void sinkScalarOperands(Instruction *PredInst);
583 
584   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
585   /// represented as.
586   void truncateToMinimalBitwidths(VPTransformState &State);
587 
588   /// Returns (and creates if needed) the original loop trip count.
589   Value *getOrCreateTripCount(BasicBlock *InsertBlock);
590 
591   /// Returns (and creates if needed) the trip count of the widened loop.
592   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
593 
594   /// Returns a bitcasted value to the requested vector type.
595   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
596   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
597                                 const DataLayout &DL);
598 
599   /// Emit a bypass check to see if the vector trip count is zero, including if
600   /// it overflows.
601   void emitIterationCountCheck(BasicBlock *Bypass);
602 
603   /// Emit a bypass check to see if all of the SCEV assumptions we've
604   /// had to make are correct. Returns the block containing the checks or
605   /// nullptr if no checks have been added.
606   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
607 
608   /// Emit bypass checks to check any memory assumptions we may have made.
609   /// Returns the block containing the checks or nullptr if no checks have been
610   /// added.
611   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
612 
613   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
614   /// vector loop preheader, middle block and scalar preheader.
615   void createVectorLoopSkeleton(StringRef Prefix);
616 
617   /// Create new phi nodes for the induction variables to resume iteration count
618   /// in the scalar epilogue, from where the vectorized loop left off.
619   /// In cases where the loop skeleton is more complicated (e.g. epilogue
620   /// vectorization) and the resume values can come from an additional bypass
621   /// block, the \p AdditionalBypass pair provides information about the bypass
622   /// block and the end value on the edge from bypass to this loop.
623   void createInductionResumeValues(
624       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
625 
626   /// Complete the loop skeleton by adding debug MDs, creating appropriate
627   /// conditional branches in the middle block, preparing the builder and
628   /// running the verifier. Return the preheader of the completed vector loop.
629   BasicBlock *completeLoopSkeleton();
630 
631   /// Collect poison-generating recipes that may generate a poison value that is
632   /// used after vectorization, even when their operands are not poison. Those
633   /// recipes meet the following conditions:
634   ///  * Contribute to the address computation of a recipe generating a widen
635   ///    memory load/store (VPWidenMemoryInstructionRecipe or
636   ///    VPInterleaveRecipe).
637   ///  * Such a widen memory load/store has at least one underlying Instruction
638   ///    that is in a basic block that needs predication and after vectorization
639   ///    the generated instruction won't be predicated.
640   void collectPoisonGeneratingRecipes(VPTransformState &State);
641 
642   /// Allow subclasses to override and print debug traces before/after vplan
643   /// execution, when trace information is requested.
644   virtual void printDebugTracesAtStart() {}
645   virtual void printDebugTracesAtEnd() {}
646 
647   /// The original loop.
648   Loop *OrigLoop;
649 
650   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
651   /// dynamic knowledge to simplify SCEV expressions and converts them to a
652   /// more usable form.
653   PredicatedScalarEvolution &PSE;
654 
655   /// Loop Info.
656   LoopInfo *LI;
657 
658   /// Dominator Tree.
659   DominatorTree *DT;
660 
661   /// Target Library Info.
662   const TargetLibraryInfo *TLI;
663 
664   /// Target Transform Info.
665   const TargetTransformInfo *TTI;
666 
667   /// Assumption Cache.
668   AssumptionCache *AC;
669 
670   /// Interface to emit optimization remarks.
671   OptimizationRemarkEmitter *ORE;
672 
673   /// The vectorization SIMD factor to use. Each vector will have this many
674   /// vector elements.
675   ElementCount VF;
676 
677   ElementCount MinProfitableTripCount;
678 
679   /// The vectorization unroll factor to use. Each scalar is vectorized to this
680   /// many different vector instructions.
681   unsigned UF;
682 
683   /// The builder that we use
684   IRBuilder<> Builder;
685 
686   // --- Vectorization state ---
687 
688   /// The vector-loop preheader.
689   BasicBlock *LoopVectorPreHeader;
690 
691   /// The scalar-loop preheader.
692   BasicBlock *LoopScalarPreHeader;
693 
694   /// Middle Block between the vector and the scalar.
695   BasicBlock *LoopMiddleBlock;
696 
697   /// The unique ExitBlock of the scalar loop if one exists.  Note that
698   /// there can be multiple exiting edges reaching this block.
699   BasicBlock *LoopExitBlock;
700 
701   /// The scalar loop body.
702   BasicBlock *LoopScalarBody;
703 
704   /// A list of all bypass blocks. The first block is the entry of the loop.
705   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
706 
707   /// Store instructions that were predicated.
708   SmallVector<Instruction *, 4> PredicatedInstructions;
709 
710   /// Trip count of the original loop.
711   Value *TripCount = nullptr;
712 
713   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
714   Value *VectorTripCount = nullptr;
715 
716   /// The legality analysis.
717   LoopVectorizationLegality *Legal;
718 
719   /// The profitability analysis.
720   LoopVectorizationCostModel *Cost;
721 
722   // Record whether runtime checks are added.
723   bool AddedSafetyChecks = false;
724 
725   // Holds the end values for each induction variable. We save the end values
726   // so we can later fix-up the external users of the induction variables.
727   DenseMap<PHINode *, Value *> IVEndValues;
728 
729   /// BFI and PSI are used to check for profile guided size optimizations.
730   BlockFrequencyInfo *BFI;
731   ProfileSummaryInfo *PSI;
732 
733   // Whether this loop should be optimized for size based on profile-guided
734   // size optimizations.
735   bool OptForSizeBasedOnProfile;
736 
737   /// Structure to hold information about generated runtime checks, responsible
738   /// for cleaning up the checks if vectorization turns out to be unprofitable.
739   GeneratedRTChecks &RTChecks;
740 
741   // Holds the resume values for reductions in the loops, used to set the
742   // correct start value of reduction PHIs when vectorizing the epilogue.
743   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
744       ReductionResumeValues;
745 };
746 
747 class InnerLoopUnroller : public InnerLoopVectorizer {
748 public:
749   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
750                     LoopInfo *LI, DominatorTree *DT,
751                     const TargetLibraryInfo *TLI,
752                     const TargetTransformInfo *TTI, AssumptionCache *AC,
753                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
754                     LoopVectorizationLegality *LVL,
755                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
756                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
757       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
758                             ElementCount::getFixed(1),
759                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
760                             BFI, PSI, Check) {}
761 
762 private:
763   Value *getBroadcastInstrs(Value *V) override;
764 };
765 
766 /// Encapsulate information regarding vectorization of a loop and its epilogue.
767 /// This information is meant to be updated and used across two stages of
768 /// epilogue vectorization.
769 struct EpilogueLoopVectorizationInfo {
770   ElementCount MainLoopVF = ElementCount::getFixed(0);
771   unsigned MainLoopUF = 0;
772   ElementCount EpilogueVF = ElementCount::getFixed(0);
773   unsigned EpilogueUF = 0;
774   BasicBlock *MainLoopIterationCountCheck = nullptr;
775   BasicBlock *EpilogueIterationCountCheck = nullptr;
776   BasicBlock *SCEVSafetyCheck = nullptr;
777   BasicBlock *MemSafetyCheck = nullptr;
778   Value *TripCount = nullptr;
779   Value *VectorTripCount = nullptr;
780 
781   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
782                                 ElementCount EVF, unsigned EUF)
783       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
784     assert(EUF == 1 &&
785            "A high UF for the epilogue loop is likely not beneficial.");
786   }
787 };
788 
789 /// An extension of the inner loop vectorizer that creates a skeleton for a
790 /// vectorized loop that has its epilogue (residual) also vectorized.
791 /// The idea is to run the vplan on a given loop twice: first to set up the
792 /// skeleton and vectorize the main loop, and second to complete the skeleton
793 /// from the first step and vectorize the epilogue. This is achieved by
794 /// deriving two concrete strategy classes from this base class and invoking
795 /// them in succession from the loop vectorizer planner.
796 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
797 public:
798   InnerLoopAndEpilogueVectorizer(
799       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
800       DominatorTree *DT, const TargetLibraryInfo *TLI,
801       const TargetTransformInfo *TTI, AssumptionCache *AC,
802       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
803       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
804       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
805       GeneratedRTChecks &Checks)
806       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
807                             EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
808                             CM, BFI, PSI, Checks),
809         EPI(EPI) {}
810 
811   // Override this function to handle the more complex control flow around the
812   // three loops.
813   std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final {
814     return createEpilogueVectorizedLoopSkeleton();
815   }
816 
817   /// The interface for creating a vectorized skeleton using one of two
818   /// different strategies, each corresponding to one execution of the vplan
819   /// as described above.
820   virtual std::pair<BasicBlock *, Value *>
821   createEpilogueVectorizedLoopSkeleton() = 0;
822 
823   /// Holds and updates state information required to vectorize the main loop
824   /// and its epilogue in two separate passes. This setup helps us avoid
825   /// regenerating and recomputing runtime safety checks. It also helps us to
826   /// shorten the iteration-count-check path length for the cases where the
827   /// iteration count of the loop is so small that the main vector loop is
828   /// completely skipped.
829   EpilogueLoopVectorizationInfo &EPI;
830 };
831 
832 /// A specialized derived class of inner loop vectorizer that performs
833 /// vectorization of *main* loops in the process of vectorizing loops and their
834 /// epilogues.
835 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
836 public:
837   EpilogueVectorizerMainLoop(
838       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
839       DominatorTree *DT, const TargetLibraryInfo *TLI,
840       const TargetTransformInfo *TTI, AssumptionCache *AC,
841       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
842       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
843       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
844       GeneratedRTChecks &Check)
845       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
846                                        EPI, LVL, CM, BFI, PSI, Check) {}
847   /// Implements the interface for creating a vectorized skeleton using the
848   /// *main loop* strategy (i.e. the first pass of vplan execution).
849   std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
850 
851 protected:
852   /// Emits an iteration count bypass check once for the main loop (when \p
853   /// ForEpilogue is false) and once for the epilogue loop (when \p
854   /// ForEpilogue is true).
855   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
856   void printDebugTracesAtStart() override;
857   void printDebugTracesAtEnd() override;
858 };
859 
860 // A specialized derived class of inner loop vectorizer that performs
861 // vectorization of *epilogue* loops in the process of vectorizing loops and
862 // their epilogues.
863 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
864 public:
865   EpilogueVectorizerEpilogueLoop(
866       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
867       DominatorTree *DT, const TargetLibraryInfo *TLI,
868       const TargetTransformInfo *TTI, AssumptionCache *AC,
869       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
870       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
871       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
872       GeneratedRTChecks &Checks)
873       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
874                                        EPI, LVL, CM, BFI, PSI, Checks) {
875     TripCount = EPI.TripCount;
876   }
877   /// Implements the interface for creating a vectorized skeleton using the
878   /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
879   std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;
880 
881 protected:
882   /// Emits an iteration count bypass check after the main vector loop has
883   /// finished to see if there are any iterations left to execute by either
884   /// the vector epilogue or the scalar epilogue.
885   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
886                                                       BasicBlock *Insert);
888   void printDebugTracesAtStart() override;
889   void printDebugTracesAtEnd() override;
890 };
891 } // end namespace llvm
892 
893 /// Look for a meaningful debug location on the instruction or its
894 /// operands.
895 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
896   if (!I)
897     return I;
898 
899   DebugLoc Empty;
900   if (I->getDebugLoc() != Empty)
901     return I;
902 
903   for (Use &Op : I->operands()) {
904     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
905       if (OpInst->getDebugLoc() != Empty)
906         return OpInst;
907   }
908 
909   return I;
910 }
911 
912 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
913 /// is passed, the message relates to that particular instruction.
914 #ifndef NDEBUG
915 static void debugVectorizationMessage(const StringRef Prefix,
916                                       const StringRef DebugMsg,
917                                       Instruction *I) {
918   dbgs() << "LV: " << Prefix << DebugMsg;
919   if (I != nullptr)
920     dbgs() << " " << *I;
921   else
922     dbgs() << '.';
923   dbgs() << '\n';
924 }
925 #endif
926 
927 /// Create an analysis remark that explains why vectorization failed
928 ///
929 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
930 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
931 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
932 /// the location of the remark.  \return the remark object that can be
933 /// streamed to.
934 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
935     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
936   Value *CodeRegion = TheLoop->getHeader();
937   DebugLoc DL = TheLoop->getStartLoc();
938 
939   if (I) {
940     CodeRegion = I->getParent();
941     // If there is no debug location attached to the instruction, fall back to
942     // using the loop's.
943     if (I->getDebugLoc())
944       DL = I->getDebugLoc();
945   }
946 
947   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
948 }
949 
950 namespace llvm {
951 
952 /// Return a value for Step multiplied by VF.
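/// For example, with Step = 2 and a scalable VF whose known minimum is 4, the
/// result is vscale * 8; with a fixed VF of 4 it is the constant 8.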
953 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
954                        int64_t Step) {
955   assert(Ty->isIntegerTy() && "Expected an integer step");
956   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
957   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
958 }
959 
960 /// Return the runtime value for VF.
961 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
962   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
963   return VF.isScalable() ? B.CreateVScale(EC) : EC;
964 }
965 
966 const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE) {
967   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
968   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
969 
970   ScalarEvolution &SE = *PSE.getSE();
971 
972   // The exit count might have type i64 while the phi is i32. This can happen
973   // if we have an induction variable that is sign-extended before the compare.
974   // The only way we can get a backedge-taken count in that case is if the
975   // induction variable was signed and as such will not overflow, so truncation
976   // is legal.
977   if (SE.getTypeSizeInBits(BackedgeTakenCount->getType()) >
978       IdxTy->getPrimitiveSizeInBits())
979     BackedgeTakenCount = SE.getTruncateOrNoop(BackedgeTakenCount, IdxTy);
980   BackedgeTakenCount = SE.getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
981 
982   // Get the total trip count from the count by adding 1.
983   return SE.getAddExpr(BackedgeTakenCount,
984                        SE.getOne(BackedgeTakenCount->getType()));
985 }
986 
987 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
988                                   ElementCount VF) {
989   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
990   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
991   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
992   return B.CreateUIToFP(RuntimeVF, FTy);
993 }
994 
995 void reportVectorizationFailure(const StringRef DebugMsg,
996                                 const StringRef OREMsg, const StringRef ORETag,
997                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
998                                 Instruction *I) {
999   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1000   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1001   ORE->emit(
1002       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1003       << "loop not vectorized: " << OREMsg);
1004 }
1005 
1006 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1007                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1008                              Instruction *I) {
1009   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1010   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1011   ORE->emit(
1012       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1013       << Msg);
1014 }
1015 
1016 } // end namespace llvm
1017 
1018 #ifndef NDEBUG
1019 /// \return string containing a file name and a line # for the given loop.
1020 static std::string getDebugLocString(const Loop *L) {
1021   std::string Result;
1022   if (L) {
1023     raw_string_ostream OS(Result);
1024     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1025       LoopDbgLoc.print(OS);
1026     else
1027       // Just print the module name.
1028       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1029     OS.flush();
1030   }
1031   return Result;
1032 }
1033 #endif
1034 
1035 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1036     VPTransformState &State) {
1037 
1038   // Collect recipes in the backward slice of `Root` that may generate a poison
1039   // value that is used after vectorization.
1040   SmallPtrSet<VPRecipeBase *, 16> Visited;
1041   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1042     SmallVector<VPRecipeBase *, 16> Worklist;
1043     Worklist.push_back(Root);
1044 
1045     // Traverse the backward slice of Root through its use-def chain.
1046     while (!Worklist.empty()) {
1047       VPRecipeBase *CurRec = Worklist.back();
1048       Worklist.pop_back();
1049 
1050       if (!Visited.insert(CurRec).second)
1051         continue;
1052 
1053       // Prune search if we find another recipe generating a widen memory
1054       // instruction. Widen memory instructions involved in address computation
1055       // will lead to gather/scatter instructions, which don't need to be
1056       // handled.
1057       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1058           isa<VPInterleaveRecipe>(CurRec) ||
1059           isa<VPScalarIVStepsRecipe>(CurRec) ||
1060           isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1061           isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1062         continue;
1063 
1064       // This recipe contributes to the address computation of a widen
1065       // load/store. Collect recipe if its underlying instruction has
1066       // poison-generating flags.
1067       Instruction *Instr = CurRec->getUnderlyingInstr();
1068       if (Instr && Instr->hasPoisonGeneratingFlags())
1069         State.MayGeneratePoisonRecipes.insert(CurRec);
1070 
1071       // Add new definitions to the worklist.
1072       for (VPValue *operand : CurRec->operands())
1073         if (VPRecipeBase *OpDef = operand->getDefiningRecipe())
1074           Worklist.push_back(OpDef);
1075     }
1076   });
1077 
1078   // Traverse all the recipes in the VPlan and collect the poison-generating
1079   // recipes in the backward slice starting at the address of a
1080   // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
1081   auto Iter = vp_depth_first_deep(State.Plan->getEntry());
1082   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1083     for (VPRecipeBase &Recipe : *VPBB) {
1084       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1085         Instruction &UnderlyingInstr = WidenRec->getIngredient();
1086         VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
1087         if (AddrDef && WidenRec->isConsecutive() &&
1088             Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1089           collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1090       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1091         VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
1092         if (AddrDef) {
1093           // Check if any member of the interleave group needs predication.
1094           const InterleaveGroup<Instruction> *InterGroup =
1095               InterleaveRec->getInterleaveGroup();
1096           bool NeedPredication = false;
1097           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1098                I < NumMembers; ++I) {
1099             Instruction *Member = InterGroup->getMember(I);
1100             if (Member)
1101               NeedPredication |=
1102                   Legal->blockNeedsPredication(Member->getParent());
1103           }
1104 
1105           if (NeedPredication)
1106             collectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
1107         }
1108       }
1109     }
1110   }
1111 }
1112 
1113 PHINode *InnerLoopVectorizer::getReductionResumeValue(
1114     const RecurrenceDescriptor &RdxDesc) {
1115   auto It = ReductionResumeValues.find(&RdxDesc);
1116   assert(It != ReductionResumeValues.end() &&
1117          "Expected to find a resume value for the reduction.");
1118   return It->second;
1119 }
1120 
1121 namespace llvm {
1122 
1123 // Loop vectorization cost-model hints describing how the scalar epilogue loop
1124 // should be lowered.
1125 enum ScalarEpilogueLowering {
1126 
1127   // The default: allowing scalar epilogues.
1128   CM_ScalarEpilogueAllowed,
1129 
1130   // Vectorization with OptForSize: don't allow epilogues.
1131   CM_ScalarEpilogueNotAllowedOptSize,
1132 
1133   // A special case of vectorization with OptForSize: loops with a very small
1134   // trip count are considered for vectorization under OptForSize, thereby
1135   // making sure the cost of their loop body is dominant and free of runtime
1136   // guards and scalar iteration overheads.
1137   CM_ScalarEpilogueNotAllowedLowTripLoop,
1138 
1139   // Loop hint predicate indicating an epilogue is undesired.
1140   CM_ScalarEpilogueNotNeededUsePredicate,
1141 
1142   // Directive indicating we must either tail fold or not vectorize
1143   CM_ScalarEpilogueNotAllowedUsePredicate
1144 };
1145 
1146 /// ElementCountComparator creates a total ordering for ElementCount
1147 /// for the purposes of using it in a set structure.
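/// Fixed-width counts are ordered before scalable ones, and within each group
/// the known minimum value decides (e.g. 2 < 4 < vscale x 2 < vscale x 4).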
1148 struct ElementCountComparator {
1149   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1150     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1151            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1152   }
1153 };
1154 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1155 
1156 /// LoopVectorizationCostModel - estimates the expected speedups due to
1157 /// vectorization.
1158 /// In many cases vectorization is not profitable. This can happen for a
1159 /// number of reasons. In this class we mainly attempt to predict the
1160 /// expected speedup/slowdowns due to the supported instruction set. We use the
1161 /// TargetTransformInfo to query the different backends for the cost of
1162 /// different operations.
1163 class LoopVectorizationCostModel {
1164 public:
1165   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1166                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1167                              LoopVectorizationLegality *Legal,
1168                              const TargetTransformInfo &TTI,
1169                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1170                              AssumptionCache *AC,
1171                              OptimizationRemarkEmitter *ORE, const Function *F,
1172                              const LoopVectorizeHints *Hints,
1173                              InterleavedAccessInfo &IAI)
1174       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1175         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1176         Hints(Hints), InterleaveInfo(IAI) {}
1177 
1178   /// \return An upper bound for the vectorization factors (both fixed and
1179   /// scalable). If the factors are 0, vectorization and interleaving should be
1180   /// avoided up front.
1181   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1182 
1183   /// \return True if runtime checks are required for vectorization, and false
1184   /// otherwise.
1185   bool runtimeChecksRequired();
1186 
1187   /// \return The most profitable vectorization factor and the cost of that VF.
1188   /// This method checks every VF in \p CandidateVFs. If a UserVF is specified
1189   /// (i.e. it is non-zero), that vectorization factor is selected as long as
1190   /// vectorization is possible.
1191   VectorizationFactor
1192   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1193 
1194   VectorizationFactor
1195   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1196                                     const LoopVectorizationPlanner &LVP);
1197 
1198   /// Setup cost-based decisions for user vectorization factor.
1199   /// \return true if the UserVF is a feasible VF to be chosen.
1200   bool selectUserVectorizationFactor(ElementCount UserVF) {
1201     collectUniformsAndScalars(UserVF);
1202     collectInstsToScalarize(UserVF);
1203     return expectedCost(UserVF).first.isValid();
1204   }
1205 
1206   /// \return The size (in bits) of the smallest and widest types in the code
1207   /// that needs to be vectorized. We ignore values that remain scalar such as
1208   /// 64 bit loop indices.
1209   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1210 
1211   /// \return The desired interleave count.
1212   /// If interleave count has been specified by metadata it will be returned.
1213   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1214   /// are the selected vectorization factor and the cost of the selected VF.
1215   unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1216 
  /// A memory access instruction may be vectorized in more than one way, and
  /// the form it takes after vectorization depends on cost. This function
  /// makes cost-based decisions for Load/Store instructions and collects them
  /// in a map. The decision map is used for building the lists of loop-uniform
  /// and loop-scalar instructions. The calculated cost is saved with the
  /// widening decision to avoid redundant calculations.
1224   void setCostBasedWideningDecision(ElementCount VF);
1225 
1226   /// A struct that represents some properties of the register usage
1227   /// of a loop.
1228   struct RegisterUsage {
1229     /// Holds the number of loop invariant values that are used in the loop.
1230     /// The key is ClassID of target-provided register class.
1231     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1232     /// Holds the maximum number of concurrent live intervals in the loop.
1233     /// The key is ClassID of target-provided register class.
1234     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1235   };
1236 
  /// \return Information about the register usage of the loop for the given
  /// vectorization factors.
1239   SmallVector<RegisterUsage, 8>
1240   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1241 
1242   /// Collect values we want to ignore in the cost model.
1243   void collectValuesToIgnore();
1244 
1245   /// Collect all element types in the loop for which widening is needed.
1246   void collectElementTypesForWidening();
1247 
  /// Split reductions into those that happen inside the loop and those that
  /// happen outside it. In-loop reductions are collected into
  /// InLoopReductionChains.
1250   void collectInLoopReductions();
1251 
1252   /// Returns true if we should use strict in-order reductions for the given
1253   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1254   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1255   /// of FP operations.
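  /// For example (illustrative), a floating-point accumulation such as
  ///   for (i) Sum += A[i];
  /// compiled without reassociation permission must keep the original
  /// operation order, so it is vectorized as an ordered (strict) reduction
  /// rather than a reassociated tree reduction.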
1256   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1257     return !Hints->allowReordering() && RdxDesc.isOrdered();
1258   }
1259 
1260   /// \returns The smallest bitwidth each instruction can be represented with.
1261   /// The vector equivalents of these instructions should be truncated to this
1262   /// type.
1263   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1264     return MinBWs;
1265   }
1266 
1267   /// \returns True if it is more profitable to scalarize instruction \p I for
1268   /// vectorization factor \p VF.
1269   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1270     assert(VF.isVector() &&
1271            "Profitable to scalarize relevant only for VF > 1.");
1272 
1273     // Cost model is not run in the VPlan-native path - return conservative
1274     // result until this changes.
1275     if (EnableVPlanNativePath)
1276       return false;
1277 
1278     auto Scalars = InstsToScalarize.find(VF);
1279     assert(Scalars != InstsToScalarize.end() &&
1280            "VF not yet analyzed for scalarization profitability");
1281     return Scalars->second.find(I) != Scalars->second.end();
1282   }
1283 
1284   /// Returns true if \p I is known to be uniform after vectorization.
1285   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1286     if (VF.isScalar())
1287       return true;
1288 
1289     // Cost model is not run in the VPlan-native path - return conservative
1290     // result until this changes.
1291     if (EnableVPlanNativePath)
1292       return false;
1293 
1294     auto UniformsPerVF = Uniforms.find(VF);
1295     assert(UniformsPerVF != Uniforms.end() &&
1296            "VF not yet analyzed for uniformity");
1297     return UniformsPerVF->second.count(I);
1298   }
1299 
1300   /// Returns true if \p I is known to be scalar after vectorization.
1301   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1302     if (VF.isScalar())
1303       return true;
1304 
1305     // Cost model is not run in the VPlan-native path - return conservative
1306     // result until this changes.
1307     if (EnableVPlanNativePath)
1308       return false;
1309 
1310     auto ScalarsPerVF = Scalars.find(VF);
1311     assert(ScalarsPerVF != Scalars.end() &&
1312            "Scalar values are not calculated for VF");
1313     return ScalarsPerVF->second.count(I);
1314   }
1315 
1316   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1317   /// for vectorization factor \p VF.
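  /// For example (illustrative), if an i32 add in the loop only feeds uses
  /// that demand 8 bits, MinBWs may record 8 for it, and the widened add can
  /// then be performed on a <VF x i8> vector instead of <VF x i32>.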
1318   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1319     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1320            !isProfitableToScalarize(I, VF) &&
1321            !isScalarAfterVectorization(I, VF);
1322   }
1323 
1324   /// Decision that was taken during cost calculation for memory instruction.
1325   enum InstWidening {
1326     CM_Unknown,
1327     CM_Widen,         // For consecutive accesses with stride +1.
1328     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1329     CM_Interleave,
1330     CM_GatherScatter,
1331     CM_Scalarize
1332   };
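  // Illustrative examples of how typical access patterns may map to these
  // decisions (the actual choice is cost- and legality-driven):
  //   A[i]     -> CM_Widen (consecutive, stride +1)
  //   A[N - i] -> CM_Widen_Reverse (consecutive, stride -1)
  //   A[2 * i] -> CM_Interleave or CM_GatherScatter
  //   A[B[i]]  -> CM_GatherScatter or CM_Scalarize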
1333 
1334   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1335   /// instruction \p I and vector width \p VF.
1336   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1337                            InstructionCost Cost) {
1338     assert(VF.isVector() && "Expected VF >=2");
1339     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1340   }
1341 
1342   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1343   /// interleaving group \p Grp and vector width \p VF.
1344   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1345                            ElementCount VF, InstWidening W,
1346                            InstructionCost Cost) {
1347     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group; the cost
    // will be assigned to one instruction only (the insert position).
1350     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1351       if (auto *I = Grp->getMember(i)) {
1352         if (Grp->getInsertPos() == I)
1353           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1354         else
1355           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1356       }
1357     }
1358   }
1359 
1360   /// Return the cost model decision for the given instruction \p I and vector
1361   /// width \p VF. Return CM_Unknown if this instruction did not pass
1362   /// through the cost modeling.
1363   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1364     assert(VF.isVector() && "Expected VF to be a vector VF");
1365     // Cost model is not run in the VPlan-native path - return conservative
1366     // result until this changes.
1367     if (EnableVPlanNativePath)
1368       return CM_GatherScatter;
1369 
1370     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1371     auto Itr = WideningDecisions.find(InstOnVF);
1372     if (Itr == WideningDecisions.end())
1373       return CM_Unknown;
1374     return Itr->second.first;
1375   }
1376 
1377   /// Return the vectorization cost for the given instruction \p I and vector
1378   /// width \p VF.
1379   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1380     assert(VF.isVector() && "Expected VF >=2");
1381     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1382     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1383            "The cost is not calculated");
1384     return WideningDecisions[InstOnVF].second;
1385   }
1386 
1387   /// Return True if instruction \p I is an optimizable truncate whose operand
1388   /// is an induction variable. Such a truncate will be removed by adding a new
1389   /// induction variable with the destination type.
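  /// For example (illustrative), for an i64 induction variable %i, a use
  ///   %t = trunc i64 %i to i32
  /// can be served by a new i32 induction variable that is stepped directly
  /// in the narrower type, making the truncate redundant.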
1390   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1391     // If the instruction is not a truncate, return false.
1392     auto *Trunc = dyn_cast<TruncInst>(I);
1393     if (!Trunc)
1394       return false;
1395 
1396     // Get the source and destination types of the truncate.
1397     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1398     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1399 
1400     // If the truncate is free for the given types, return false. Replacing a
1401     // free truncate with an induction variable would add an induction variable
1402     // update instruction to each iteration of the loop. We exclude from this
1403     // check the primary induction variable since it will need an update
1404     // instruction regardless.
1405     Value *Op = Trunc->getOperand(0);
1406     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1407       return false;
1408 
1409     // If the truncated value is not an induction variable, return false.
1410     return Legal->isInductionPhi(Op);
1411   }
1412 
1413   /// Collects the instructions to scalarize for each predicated instruction in
1414   /// the loop.
1415   void collectInstsToScalarize(ElementCount VF);
1416 
1417   /// Collect Uniform and Scalar values for the given \p VF.
1418   /// The sets depend on CM decision for Load/Store instructions
1419   /// that may be vectorized as interleave, gather-scatter or scalarized.
1420   void collectUniformsAndScalars(ElementCount VF) {
1421     // Do the analysis once.
1422     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1423       return;
1424     setCostBasedWideningDecision(VF);
1425     collectLoopUniforms(VF);
1426     collectLoopScalars(VF);
1427   }
1428 
1429   /// Returns true if the target machine supports masked store operation
1430   /// for the given \p DataType and kind of access to \p Ptr.
1431   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1432     return Legal->isConsecutivePtr(DataType, Ptr) &&
1433            TTI.isLegalMaskedStore(DataType, Alignment);
1434   }
1435 
1436   /// Returns true if the target machine supports masked load operation
1437   /// for the given \p DataType and kind of access to \p Ptr.
1438   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1439     return Legal->isConsecutivePtr(DataType, Ptr) &&
1440            TTI.isLegalMaskedLoad(DataType, Alignment);
1441   }
1442 
1443   /// Returns true if the target machine can represent \p V as a masked gather
1444   /// or scatter operation.
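  /// For example (illustrative), an indexed access such as A[B[i]] yields
  /// non-consecutive addresses, so widening it requires target support for
  /// llvm.masked.gather (loads) or llvm.masked.scatter (stores) on the
  /// vectorized type.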
1445   bool isLegalGatherOrScatter(Value *V,
1446                               ElementCount VF = ElementCount::getFixed(1)) {
1447     bool LI = isa<LoadInst>(V);
1448     bool SI = isa<StoreInst>(V);
1449     if (!LI && !SI)
1450       return false;
1451     auto *Ty = getLoadStoreType(V);
1452     Align Align = getLoadStoreAlignment(V);
1453     if (VF.isVector())
1454       Ty = VectorType::get(Ty, VF);
1455     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1456            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1457   }
1458 
1459   /// Returns true if the target machine supports all of the reduction
1460   /// variables found for the given VF.
1461   bool canVectorizeReductions(ElementCount VF) const {
1462     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1463       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1464       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1465     }));
1466   }
1467 
1468   /// Given costs for both strategies, return true if the scalar predication
1469   /// lowering should be used for div/rem.  This incorporates an override
1470   /// option so it is not simply a cost comparison.
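  /// As a rough sketch of the two strategies for a predicated division:
  ///   scalar predication: branch around a scalar divide for each active lane;
  ///   safe divisor:       select a known-safe divisor (e.g. 1) for masked-off
  ///                       lanes and emit a single wide divide.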
1471   bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1472                                      InstructionCost SafeDivisorCost) const {
1473     switch (ForceSafeDivisor) {
1474     case cl::BOU_UNSET:
1475       return ScalarCost < SafeDivisorCost;
1476     case cl::BOU_TRUE:
1477       return false;
1478     case cl::BOU_FALSE:
1479       return true;
1480     };
1481     llvm_unreachable("impossible case value");
1482   }
1483 
1484   /// Returns true if \p I is an instruction which requires predication and
1485   /// for which our chosen predication strategy is scalarization (i.e. we
1486   /// don't have an alternate strategy such as masking available).
1487   /// \p VF is the vectorization factor that will be used to vectorize \p I.
1488   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1489 
1490   /// Returns true if \p I is an instruction that needs to be predicated
1491   /// at runtime.  The result is independent of the predication mechanism.
1492   /// Superset of instructions that return true for isScalarWithPredication.
1493   bool isPredicatedInst(Instruction *I) const;
1494 
1495   /// Return the costs for our two available strategies for lowering a
1496   /// div/rem operation which requires speculating at least one lane.
1497   /// First result is for scalarization (will be invalid for scalable
1498   /// vectors); second is for the safe-divisor strategy.
1499   std::pair<InstructionCost, InstructionCost>
1500   getDivRemSpeculationCost(Instruction *I,
1501                            ElementCount VF) const;
1502 
1503   /// Returns true if \p I is a memory instruction with consecutive memory
1504   /// access that can be widened.
1505   bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1506 
1507   /// Returns true if \p I is a memory instruction in an interleaved-group
1508   /// of memory accesses that can be vectorized with wide vector loads/stores
1509   /// and shuffles.
1510   bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
1511 
1512   /// Check if \p Instr belongs to any interleaved access group.
1513   bool isAccessInterleaved(Instruction *Instr) {
1514     return InterleaveInfo.isInterleaved(Instr);
1515   }
1516 
1517   /// Get the interleaved access group that \p Instr belongs to.
1518   const InterleaveGroup<Instruction> *
1519   getInterleavedAccessGroup(Instruction *Instr) {
1520     return InterleaveInfo.getInterleaveGroup(Instr);
1521   }
1522 
1523   /// Returns true if we're required to use a scalar epilogue for at least
1524   /// the final iteration of the original loop.
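  /// For instance (illustrative), with a trip count of 10 and VF = 4 the
  /// vector loop covers 8 iterations and the remaining 2 run in the scalar
  /// epilogue, unless the tail is folded by masking instead.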
1525   bool requiresScalarEpilogue(ElementCount VF) const {
1526     if (!isScalarEpilogueAllowed())
1527       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
1530     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1531       return true;
1532     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1533   }
1534 
1535   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1536   /// loop hint annotation.
1537   bool isScalarEpilogueAllowed() const {
1538     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1539   }
1540 
1541   /// Returns true if all loop blocks should be masked to fold tail loop.
1542   bool foldTailByMasking() const { return FoldTailByMasking; }
1543 
  /// Returns true if we're tail-folding and want to use the active lane mask
1545   /// for vector loop control flow.
1546   bool useActiveLaneMaskForControlFlow() const {
1547     return FoldTailByMasking &&
1548            TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
1549   }
1550 
  /// Returns true if the instructions in this block require predication
1552   /// for any reason, e.g. because tail folding now requires a predicate
1553   /// or because the block in the original loop was predicated.
1554   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1555     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1556   }
1557 
1558   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1559   /// nodes to the chain of instructions representing the reductions. Uses a
1560   /// MapVector to ensure deterministic iteration order.
1561   using ReductionChainMap =
1562       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1563 
1564   /// Return the chain of instructions representing an inloop reduction.
1565   const ReductionChainMap &getInLoopReductionChains() const {
1566     return InLoopReductionChains;
1567   }
1568 
1569   /// Returns true if the Phi is part of an inloop reduction.
1570   bool isInLoopReduction(PHINode *Phi) const {
1571     return InLoopReductionChains.count(Phi);
1572   }
1573 
1574   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1575   /// with factor VF.  Return the cost of the instruction, including
1576   /// scalarization overhead if it's needed.
1577   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1578 
1579   /// Estimate cost of a call instruction CI if it were vectorized with factor
1580   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1584   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1585                                     bool &NeedToScalarize) const;
1586 
1587   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1588   /// that of B.
1589   bool isMoreProfitable(const VectorizationFactor &A,
1590                         const VectorizationFactor &B) const;
1591 
1592   /// Invalidates decisions already taken by the cost model.
1593   void invalidateCostModelingDecisions() {
1594     WideningDecisions.clear();
1595     Uniforms.clear();
1596     Scalars.clear();
1597   }
1598 
  /// Convenience function that returns the value of vscale_range if
  /// vscale_range.min == vscale_range.max, and otherwise returns the value
  /// returned by the corresponding TTI hook.
1602   std::optional<unsigned> getVScaleForTuning() const;
1603 
1604 private:
1605   unsigned NumPredStores = 0;
1606 
1607   /// \return An upper bound for the vectorization factors for both
1608   /// fixed and scalable vectorization, where the minimum-known number of
1609   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1610   /// disabled or unsupported, then the scalable part will be equal to
1611   /// ElementCount::getScalable(0).
1612   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1613                                            ElementCount UserVF,
1614                                            bool FoldTailByMasking);
1615 
  /// \return the maximized element count based on the target's vector
1617   /// registers and the loop trip-count, but limited to a maximum safe VF.
1618   /// This is a helper function of computeFeasibleMaxVF.
1619   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1620                                        unsigned SmallestType,
1621                                        unsigned WidestType,
1622                                        ElementCount MaxSafeVF,
1623                                        bool FoldTailByMasking);
1624 
1625   /// \return the maximum legal scalable VF, based on the safe max number
1626   /// of elements.
1627   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1628 
1629   /// The vectorization cost is a combination of the cost itself and a boolean
1630   /// indicating whether any of the contributing operations will actually
1631   /// operate on vector values after type legalization in the backend. If this
1632   /// latter value is false, then all operations will be scalarized (i.e. no
1633   /// vectorization has actually taken place).
1634   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1635 
1636   /// Returns the expected execution cost. The unit of the cost does
1637   /// not matter because we use the 'cost' units to compare different
1638   /// vector widths. The cost that is returned is *not* normalized by
1639   /// the factor width. If \p Invalid is not nullptr, this function
1640   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1641   /// each instruction that has an Invalid cost for the given VF.
1642   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1643   VectorizationCostTy
1644   expectedCost(ElementCount VF,
1645                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1646 
1647   /// Returns the execution time cost of an instruction for a given vector
1648   /// width. Vector width of one means scalar.
1649   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1650 
1651   /// The cost-computation logic from getInstructionCost which provides
1652   /// the vector type as an output parameter.
1653   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1654                                      Type *&VectorTy);
1655 
1656   /// Return the cost of instructions in an inloop reduction pattern, if I is
1657   /// part of that pattern.
1658   std::optional<InstructionCost>
1659   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1660                           TTI::TargetCostKind CostKind);
1661 
1662   /// Calculate vectorization cost of memory instruction \p I.
1663   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1664 
1665   /// The cost computation for scalarized memory instruction.
1666   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1667 
1668   /// The cost computation for interleaving group of memory instructions.
1669   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1670 
1671   /// The cost computation for Gather/Scatter instruction.
1672   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1673 
1674   /// The cost computation for widening instruction \p I with consecutive
1675   /// memory access.
1676   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1677 
  /// The cost calculation for Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop-invariant value stored ? 0 : extract of last
  /// element).
1682   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1683 
1684   /// Estimate the overhead of scalarizing an instruction. This is a
1685   /// convenience wrapper for the type-based getScalarizationOverhead API.
1686   InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1687                                            TTI::TargetCostKind CostKind) const;
1688 
1689   /// Returns true if an artificially high cost for emulated masked memrefs
1690   /// should be used.
1691   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1692 
1693   /// Map of scalar integer values to the smallest bitwidth they can be legally
1694   /// represented as. The vector equivalents of these values should be truncated
1695   /// to this type.
1696   MapVector<Instruction *, uint64_t> MinBWs;
1697 
1698   /// A type representing the costs for instructions if they were to be
1699   /// scalarized rather than vectorized. The entries are Instruction-Cost
1700   /// pairs.
1701   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1702 
  /// For each VF, the set of BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1705   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1706       PredicatedBBsAfterVectorization;
1707 
  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or isn't divisible by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize, and when the trip count is very small, we don't allow any
  /// iterations to execute in the scalar loop.
1715   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1716 
  /// All loop blocks are to be masked to fold the tail of scalar iterations.
1718   bool FoldTailByMasking = false;
1719 
1720   /// A map holding scalar costs for different vectorization factors. The
1721   /// presence of a cost for an instruction in the mapping indicates that the
1722   /// instruction will be scalarized when vectorizing with the associated
1723   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1724   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1725 
1726   /// Holds the instructions known to be uniform after vectorization.
1727   /// The data is collected per VF.
1728   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1729 
1730   /// Holds the instructions known to be scalar after vectorization.
1731   /// The data is collected per VF.
1732   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1733 
1734   /// Holds the instructions (address computations) that are forced to be
1735   /// scalarized.
1736   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1737 
1738   /// PHINodes of the reductions that should be expanded in-loop along with
1739   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1741   ReductionChainMap InLoopReductionChains;
1742 
  /// A map of in-loop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// VPlan. This was added to allow quick lookup of the in-loop operations,
  /// without having to loop through InLoopReductionChains.
1747   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1748 
1749   /// Returns the expected difference in cost from scalarizing the expression
1750   /// feeding a predicated instruction \p PredInst. The instructions to
1751   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1752   /// non-negative return value implies the expression will be scalarized.
1753   /// Currently, only single-use chains are considered for scalarization.
1754   InstructionCost computePredInstDiscount(Instruction *PredInst,
1755                                           ScalarCostsTy &ScalarCosts,
1756                                           ElementCount VF);
1757 
1758   /// Collect the instructions that are uniform after vectorization. An
1759   /// instruction is uniform if we represent it with a single scalar value in
1760   /// the vectorized loop corresponding to each vector iteration. Examples of
1761   /// uniform instructions include pointer operands of consecutive or
1762   /// interleaved memory accesses. Note that although uniformity implies an
1763   /// instruction will be scalar, the reverse is not true. In general, a
1764   /// scalarized instruction will be represented by VF scalar values in the
1765   /// vectorized loop, each corresponding to an iteration of the original
1766   /// scalar loop.
1767   void collectLoopUniforms(ElementCount VF);
1768 
1769   /// Collect the instructions that are scalar after vectorization. An
1770   /// instruction is scalar if it is known to be uniform or will be scalarized
1771   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1772   /// to the list if they are used by a load/store instruction that is marked as
1773   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1774   /// VF values in the vectorized loop, each corresponding to an iteration of
1775   /// the original scalar loop.
1776   void collectLoopScalars(ElementCount VF);
1777 
1778   /// Keeps cost model vectorization decision and cost for instructions.
1779   /// Right now it is used for memory instructions only.
1780   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1781                                 std::pair<InstWidening, InstructionCost>>;
1782 
1783   DecisionList WideningDecisions;
1784 
1785   /// Returns true if \p V is expected to be vectorized and it needs to be
1786   /// extracted.
1787   bool needsExtract(Value *V, ElementCount VF) const {
1788     Instruction *I = dyn_cast<Instruction>(V);
1789     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1790         TheLoop->isLoopInvariant(I))
1791       return false;
1792 
1793     // Assume we can vectorize V (and hence we need extraction) if the
1794     // scalars are not computed yet. This can happen, because it is called
1795     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1796     // the scalars are collected. That should be a safe assumption in most
1797     // cases, because we check if the operands have vectorizable types
1798     // beforehand in LoopVectorizationLegality.
1799     return Scalars.find(VF) == Scalars.end() ||
1800            !isScalarAfterVectorization(I, VF);
  }
1802 
1803   /// Returns a range containing only operands needing to be extracted.
1804   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1805                                                    ElementCount VF) const {
1806     return SmallVector<Value *, 4>(make_filter_range(
1807         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1808   }
1809 
1810   /// Determines if we have the infrastructure to vectorize loop \p L and its
1811   /// epilogue, assuming the main loop is vectorized by \p VF.
1812   bool isCandidateForEpilogueVectorization(const Loop &L,
1813                                            const ElementCount VF) const;
1814 
1815   /// Returns true if epilogue vectorization is considered profitable, and
1816   /// false otherwise.
1817   /// \p VF is the vectorization factor chosen for the original loop.
1818   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1819 
1820 public:
1821   /// The loop that we evaluate.
1822   Loop *TheLoop;
1823 
1824   /// Predicated scalar evolution analysis.
1825   PredicatedScalarEvolution &PSE;
1826 
1827   /// Loop Info analysis.
1828   LoopInfo *LI;
1829 
1830   /// Vectorization legality.
1831   LoopVectorizationLegality *Legal;
1832 
1833   /// Vector target information.
1834   const TargetTransformInfo &TTI;
1835 
1836   /// Target Library Info.
1837   const TargetLibraryInfo *TLI;
1838 
1839   /// Demanded bits analysis.
1840   DemandedBits *DB;
1841 
1842   /// Assumption cache.
1843   AssumptionCache *AC;
1844 
1845   /// Interface to emit optimization remarks.
1846   OptimizationRemarkEmitter *ORE;
1847 
1848   const Function *TheFunction;
1849 
1850   /// Loop Vectorize Hint.
1851   const LoopVectorizeHints *Hints;
1852 
  /// The interleaved access information contains groups of interleaved
  /// accesses with the same stride that are close to each other.
1855   InterleavedAccessInfo &InterleaveInfo;
1856 
1857   /// Values to ignore in the cost model.
1858   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1859 
1860   /// Values to ignore in the cost model when VF > 1.
1861   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1862 
1863   /// All element types found in the loop.
1864   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1865 
1866   /// Profitable vector factors.
1867   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1868 };
1869 } // end namespace llvm
1870 
1871 namespace {
1872 /// Helper struct to manage generating runtime checks for vectorization.
1873 ///
/// The runtime checks are created up-front in temporary blocks, un-linked from
/// the existing IR, to allow better cost estimation. After deciding to
/// vectorize, the checks are moved back. If the decision is not to vectorize,
/// the temporary blocks are completely removed.
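/// Rough lifecycle sketch: Create() builds the check blocks up-front,
/// getCost() lets the cost model account for them, emitSCEVChecks() and
/// emitMemRuntimeChecks() hook them back into the CFG once vectorization goes
/// ahead, and the destructor cleans up whatever remains unused.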
1878 class GeneratedRTChecks {
1879   /// Basic block which contains the generated SCEV checks, if any.
1880   BasicBlock *SCEVCheckBlock = nullptr;
1881 
1882   /// The value representing the result of the generated SCEV checks. If it is
1883   /// nullptr, either no SCEV checks have been generated or they have been used.
1884   Value *SCEVCheckCond = nullptr;
1885 
1886   /// Basic block which contains the generated memory runtime checks, if any.
1887   BasicBlock *MemCheckBlock = nullptr;
1888 
1889   /// The value representing the result of the generated memory runtime checks.
1890   /// If it is nullptr, either no memory runtime checks have been generated or
1891   /// they have been used.
1892   Value *MemRuntimeCheckCond = nullptr;
1893 
1894   DominatorTree *DT;
1895   LoopInfo *LI;
1896   TargetTransformInfo *TTI;
1897 
1898   SCEVExpander SCEVExp;
1899   SCEVExpander MemCheckExp;
1900 
1901   bool CostTooHigh = false;
1902 
1903 public:
1904   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1905                     TargetTransformInfo *TTI, const DataLayout &DL)
1906       : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1907         MemCheckExp(SE, DL, "scev.check") {}
1908 
1909   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1910   /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation. If
1912   /// there is no vector code generation, the check blocks are removed
1913   /// completely.
1914   void Create(Loop *L, const LoopAccessInfo &LAI,
1915               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1916 
1917     // Hard cutoff to limit compile-time increase in case a very large number of
1918     // runtime checks needs to be generated.
1919     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1920     // profile info.
1921     CostTooHigh =
1922         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1923     if (CostTooHigh)
1924       return;
1925 
1926     BasicBlock *LoopHeader = L->getHeader();
1927     BasicBlock *Preheader = L->getLoopPreheader();
1928 
1929     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1930     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1931     // may be used by SCEVExpander. The blocks will be un-linked from their
1932     // predecessors and removed from LI & DT at the end of the function.
1933     if (!UnionPred.isAlwaysTrue()) {
1934       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1935                                   nullptr, "vector.scevcheck");
1936 
1937       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1938           &UnionPred, SCEVCheckBlock->getTerminator());
1939     }
1940 
1941     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1942     if (RtPtrChecking.Need) {
1943       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1944       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1945                                  "vector.memcheck");
1946 
1947       auto DiffChecks = RtPtrChecking.getDiffChecks();
1948       if (DiffChecks) {
1949         Value *RuntimeVF = nullptr;
1950         MemRuntimeCheckCond = addDiffRuntimeChecks(
1951             MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp,
1952             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1953               if (!RuntimeVF)
1954                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1955               return RuntimeVF;
1956             },
1957             IC);
1958       } else {
1959         MemRuntimeCheckCond =
1960             addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1961                              RtPtrChecking.getChecks(), MemCheckExp);
1962       }
1963       assert(MemRuntimeCheckCond &&
1964              "no RT checks generated although RtPtrChecking "
1965              "claimed checks are required");
1966     }
1967 
1968     if (!MemCheckBlock && !SCEVCheckBlock)
1969       return;
1970 
1971     // Unhook the temporary block with the checks, update various places
1972     // accordingly.
1973     if (SCEVCheckBlock)
1974       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1975     if (MemCheckBlock)
1976       MemCheckBlock->replaceAllUsesWith(Preheader);
1977 
1978     if (SCEVCheckBlock) {
1979       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1980       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1981       Preheader->getTerminator()->eraseFromParent();
1982     }
1983     if (MemCheckBlock) {
1984       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1985       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1986       Preheader->getTerminator()->eraseFromParent();
1987     }
1988 
1989     DT->changeImmediateDominator(LoopHeader, Preheader);
1990     if (MemCheckBlock) {
1991       DT->eraseNode(MemCheckBlock);
1992       LI->removeBlock(MemCheckBlock);
1993     }
1994     if (SCEVCheckBlock) {
1995       DT->eraseNode(SCEVCheckBlock);
1996       LI->removeBlock(SCEVCheckBlock);
1997     }
1998   }
1999 
2000   InstructionCost getCost() {
2001     if (SCEVCheckBlock || MemCheckBlock)
2002       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2003 
2004     if (CostTooHigh) {
2005       InstructionCost Cost;
2006       Cost.setInvalid();
2007       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
2008       return Cost;
2009     }
2010 
2011     InstructionCost RTCheckCost = 0;
2012     if (SCEVCheckBlock)
2013       for (Instruction &I : *SCEVCheckBlock) {
2014         if (SCEVCheckBlock->getTerminator() == &I)
2015           continue;
2016         InstructionCost C =
2017             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2018         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
2019         RTCheckCost += C;
2020       }
2021     if (MemCheckBlock)
2022       for (Instruction &I : *MemCheckBlock) {
2023         if (MemCheckBlock->getTerminator() == &I)
2024           continue;
2025         InstructionCost C =
2026             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
2027         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
2028         RTCheckCost += C;
2029       }
2030 
2031     if (SCEVCheckBlock || MemCheckBlock)
2032       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2033                         << "\n");
2034 
2035     return RTCheckCost;
2036   }
2037 
2038   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2039   /// unused.
2040   ~GeneratedRTChecks() {
2041     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2042     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2043     if (!SCEVCheckCond)
2044       SCEVCleaner.markResultUsed();
2045 
2046     if (!MemRuntimeCheckCond)
2047       MemCheckCleaner.markResultUsed();
2048 
2049     if (MemRuntimeCheckCond) {
2050       auto &SE = *MemCheckExp.getSE();
2051       // Memory runtime check generation creates compares that use expanded
2052       // values. Remove them before running the SCEVExpanderCleaners.
2053       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2054         if (MemCheckExp.isInsertedInstruction(&I))
2055           continue;
2056         SE.forgetValue(&I);
2057         I.eraseFromParent();
2058       }
2059     }
2060     MemCheckCleaner.cleanup();
2061     SCEVCleaner.cleanup();
2062 
2063     if (SCEVCheckCond)
2064       SCEVCheckBlock->eraseFromParent();
2065     if (MemRuntimeCheckCond)
2066       MemCheckBlock->eraseFromParent();
2067   }
2068 
2069   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2070   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2071   /// depending on the generated condition.
2072   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2073                              BasicBlock *LoopVectorPreHeader,
2074                              BasicBlock *LoopExitBlock) {
2075     if (!SCEVCheckCond)
2076       return nullptr;
2077 
2078     Value *Cond = SCEVCheckCond;
2079     // Mark the check as used, to prevent it from being removed during cleanup.
2080     SCEVCheckCond = nullptr;
2081     if (auto *C = dyn_cast<ConstantInt>(Cond))
2082       if (C->isZero())
2083         return nullptr;
2084 
2085     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2086 
2087     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2088     // Create new preheader for vector loop.
2089     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2090       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2091 
2092     SCEVCheckBlock->getTerminator()->eraseFromParent();
2093     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2094     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2095                                                 SCEVCheckBlock);
2096 
2097     DT->addNewBlock(SCEVCheckBlock, Pred);
2098     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2099 
2100     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2101                         BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2102     return SCEVCheckBlock;
2103   }
2104 
2105   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2106   /// the branches to branch to the vector preheader or \p Bypass, depending on
2107   /// the generated condition.
2108   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2109                                    BasicBlock *LoopVectorPreHeader) {
2110     // Check if we generated code that checks in runtime if arrays overlap.
2111     if (!MemRuntimeCheckCond)
2112       return nullptr;
2113 
2114     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2115     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2116                                                 MemCheckBlock);
2117 
2118     DT->addNewBlock(MemCheckBlock, Pred);
2119     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2120     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2121 
2122     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2123       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2124 
2125     ReplaceInstWithInst(
2126         MemCheckBlock->getTerminator(),
2127         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2128     MemCheckBlock->getTerminator()->setDebugLoc(
2129         Pred->getTerminator()->getDebugLoc());
2130 
2131     // Mark the check as used, to prevent it from being removed during cleanup.
2132     MemRuntimeCheckCond = nullptr;
2133     return MemCheckBlock;
2134   }
2135 };
2136 } // namespace
2137 
2138 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2139 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2141 // vector length information is not provided, vectorization is not considered
2142 // explicit. Interleave hints are not allowed either. These limitations will be
2143 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
2145 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2146 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2147 // provides *explicit vectorization hints* (LV can bypass legal checks and
2148 // assume that vectorization is legal). However, both hints are implemented
2149 // using the same metadata (llvm.loop.vectorize, processed by
2150 // LoopVectorizeHints). This will be fixed in the future when the native IR
2151 // representation for pragma 'omp simd' is introduced.
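// For illustration, a hypothetical source-level candidate for this path is:
//   #pragma omp simd simdlen(8)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];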
2152 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2153                                    OptimizationRemarkEmitter *ORE) {
2154   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2155   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2156 
2157   // Only outer loops with an explicit vectorization hint are supported.
2158   // Unannotated outer loops are ignored.
2159   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2160     return false;
2161 
2162   Function *Fn = OuterLp->getHeader()->getParent();
2163   if (!Hints.allowVectorization(Fn, OuterLp,
2164                                 true /*VectorizeOnlyWhenForced*/)) {
2165     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2166     return false;
2167   }
2168 
2169   if (Hints.getInterleave() > 1) {
2170     // TODO: Interleave support is future work.
2171     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2172                          "outer loops.\n");
2173     Hints.emitRemarkWithHints();
2174     return false;
2175   }
2176 
2177   return true;
2178 }
2179 
2180 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2181                                   OptimizationRemarkEmitter *ORE,
2182                                   SmallVectorImpl<Loop *> &V) {
2183   // Collect inner loops and outer loops without irreducible control flow. For
2184   // now, only collect outer loops that have explicit vectorization hints. If we
2185   // are stress testing the VPlan H-CFG construction, we collect the outermost
2186   // loop of every loop nest.
2187   if (L.isInnermost() || VPlanBuildStressTest ||
2188       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2189     LoopBlocksRPO RPOT(&L);
2190     RPOT.perform(LI);
2191     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2192       V.push_back(&L);
2193       // TODO: Collect inner loops inside marked outer loops in case
2194       // vectorization fails for the outer loop. Do not invoke
2195       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2196       // already known to be reducible. We can use an inherited attribute for
2197       // that.
2198       return;
2199     }
2200   }
2201   for (Loop *InnerL : L)
2202     collectSupportedLoops(*InnerL, LI, ORE, V);
2203 }
2204 
2205 namespace {
2206 
2207 /// The LoopVectorize Pass.
2208 struct LoopVectorize : public FunctionPass {
2209   /// Pass identification, replacement for typeid
2210   static char ID;
2211 
2212   LoopVectorizePass Impl;
2213 
2214   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2215                          bool VectorizeOnlyWhenForced = false)
2216       : FunctionPass(ID),
2217         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2218     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2219   }
2220 
2221   bool runOnFunction(Function &F) override {
2222     if (skipFunction(F))
2223       return false;
2224 
2225     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2226     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2227     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2228     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2229     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2230     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2231     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2232     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2233     auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs();
2234     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2235     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2236     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2237 
2238     return Impl
2239         .runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AC, LAIs, *ORE, PSI)
2240         .MadeAnyChange;
2241   }
2242 
2243   void getAnalysisUsage(AnalysisUsage &AU) const override {
2244     AU.addRequired<AssumptionCacheTracker>();
2245     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2246     AU.addRequired<DominatorTreeWrapperPass>();
2247     AU.addRequired<LoopInfoWrapperPass>();
2248     AU.addRequired<ScalarEvolutionWrapperPass>();
2249     AU.addRequired<TargetTransformInfoWrapperPass>();
2250     AU.addRequired<LoopAccessLegacyAnalysis>();
2251     AU.addRequired<DemandedBitsWrapperPass>();
2252     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2253     AU.addRequired<InjectTLIMappingsLegacy>();
2254 
2255     // We currently do not preserve loopinfo/dominator analyses with outer loop
2256     // vectorization. Until this is addressed, mark these analyses as preserved
2257     // only for non-VPlan-native path.
2258     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2259     if (!EnableVPlanNativePath) {
2260       AU.addPreserved<LoopInfoWrapperPass>();
2261       AU.addPreserved<DominatorTreeWrapperPass>();
2262     }
2263 
2264     AU.addPreserved<BasicAAWrapperPass>();
2265     AU.addPreserved<GlobalsAAWrapperPass>();
2266     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2267   }
2268 };
2269 
2270 } // end anonymous namespace
2271 
2272 //===----------------------------------------------------------------------===//
2273 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2274 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2275 //===----------------------------------------------------------------------===//
2276 
2277 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // inside the vector loop body.
2281   Instruction *Instr = dyn_cast<Instruction>(V);
2282   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2283                      (!Instr ||
2284                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2285   // Place the code for broadcasting invariant variables in the new preheader.
2286   IRBuilder<>::InsertPointGuard Guard(Builder);
2287   if (SafeToHoist)
2288     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2289 
2290   // Broadcast the scalar into all locations in the vector.
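  // For a fixed VF of 4 this typically expands to IR of the form
  // (illustrative):
  //   %b.ins   = insertelement <4 x i32> poison, i32 %v, i64 0
  //   %b.splat = shufflevector <4 x i32> %b.ins, <4 x i32> poison,
  //                            <4 x i32> zeroinitializer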
2291   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2292 
2293   return Shuf;
2294 }
2295 
/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to the corresponding elements of Val. The sequence starts at StartIdx.
/// \p BinOp is only relevant for FP induction variables.
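/// For example (illustrative), with VF = 4, StartIdx = 0, an integer Val of
/// <%v, %v, %v, %v> and step %s, the result is
///   <%v + 0 * %s, %v + 1 * %s, %v + 2 * %s, %v + 3 * %s>.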
2300 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2301                             Instruction::BinaryOps BinOp, ElementCount VF,
2302                             IRBuilderBase &Builder) {
2303   assert(VF.isVector() && "only vector VFs are supported");
2304 
2305   // Create and check the types.
2306   auto *ValVTy = cast<VectorType>(Val->getType());
2307   ElementCount VLen = ValVTy->getElementCount();
2308 
2309   Type *STy = Val->getType()->getScalarType();
2310   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2311          "Induction Step must be an integer or FP");
2312   assert(Step->getType() == STy && "Step has wrong type");
2313 
2314   SmallVector<Constant *, 8> Indices;
2315 
  // Create a vector of consecutive numbers from 0 to VF - 1.
2317   VectorType *InitVecValVTy = ValVTy;
2318   if (STy->isFloatingPointTy()) {
2319     Type *InitVecValSTy =
2320         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2321     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2322   }
2323   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2324 
2325   // Splat the StartIdx
2326   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2327 
2328   if (STy->isIntegerTy()) {
2329     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2330     Step = Builder.CreateVectorSplat(VLen, Step);
2331     assert(Step->getType() == Val->getType() && "Invalid step vec");
2332     // FIXME: The newly created binary instructions should contain nsw/nuw
2333     // flags, which can be found from the original scalar operations.
2334     Step = Builder.CreateMul(InitVec, Step);
2335     return Builder.CreateAdd(Val, Step, "induction");
2336   }
2337 
2338   // Floating point induction.
2339   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2340          "Binary Opcode should be specified for FP induction");
2341   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2342   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2343 
2344   Step = Builder.CreateVectorSplat(VLen, Step);
2345   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2346   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2347 }
2348 
2349 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps; \p Step is the size of the step.
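/// As an illustrative sketch, with a fixed VF of 4 and UF of 2, lane L of
/// unroll part P receives the value ScalarIV + (P * 4 + L) * Step.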
2351 static void buildScalarSteps(Value *ScalarIV, Value *Step,
2352                              const InductionDescriptor &ID, VPValue *Def,
2353                              VPTransformState &State) {
2354   IRBuilderBase &Builder = State.Builder;
2355 
2356   // Ensure step has the same type as that of scalar IV.
2357   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2358   if (ScalarIVTy != Step->getType()) {
2359     // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
2360     // avoid separate truncate here.
2361     assert(Step->getType()->isIntegerTy() &&
2362            "Truncation requires an integer step");
2363     Step = State.Builder.CreateTrunc(Step, ScalarIVTy);
2364   }
2365 
2366   // We build scalar steps for both integer and floating-point induction
2367   // variables. Here, we determine the kind of arithmetic we will perform.
2368   Instruction::BinaryOps AddOp;
2369   Instruction::BinaryOps MulOp;
2370   if (ScalarIVTy->isIntegerTy()) {
2371     AddOp = Instruction::Add;
2372     MulOp = Instruction::Mul;
2373   } else {
2374     AddOp = ID.getInductionOpcode();
2375     MulOp = Instruction::FMul;
2376   }
2377 
2378   // Determine the number of scalars we need to generate for each unroll
2379   // iteration.
2380   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2381   // Compute the scalar steps and save the results in State.
2382   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2383                                      ScalarIVTy->getScalarSizeInBits());
2384   Type *VecIVTy = nullptr;
2385   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2386   if (!FirstLaneOnly && State.VF.isScalable()) {
2387     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2388     UnitStepVec =
2389         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2390     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2391     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2392   }
2393 
2394   unsigned StartPart = 0;
2395   unsigned EndPart = State.UF;
2396   unsigned StartLane = 0;
2397   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2398   if (State.Instance) {
2399     StartPart = State.Instance->Part;
2400     EndPart = StartPart + 1;
2401     StartLane = State.Instance->Lane.getKnownLane();
2402     EndLane = StartLane + 1;
2403   }
2404   for (unsigned Part = StartPart; Part < EndPart; ++Part) {
2405     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2406 
2407     if (!FirstLaneOnly && State.VF.isScalable()) {
2408       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2409       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2410       if (ScalarIVTy->isFloatingPointTy())
2411         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2412       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2413       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2414       State.set(Def, Add, Part);
2415       // It's useful to record the lane values too for the known minimum number
2416       // of elements so we do those below. This improves the code quality when
2417       // trying to extract the first element, for example.
2418     }
2419 
2420     if (ScalarIVTy->isFloatingPointTy())
2421       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2422 
2423     for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2424       Value *StartIdx = Builder.CreateBinOp(
2425           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2426       // The step returned by `createStepForVF` is a runtime-evaluated value
2427       // when VF is scalable. Otherwise, it should be folded into a Constant.
2428       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2429              "Expected StartIdx to be folded to a constant when VF is not "
2430              "scalable");
2431       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2432       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2433       State.set(Def, Add, VPIteration(Part, Lane));
2434     }
2435   }
2436 }
2437 
// Generate code for the induction step. Note that induction steps are
// required to be loop-invariant.
2440 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2441                               Instruction *InsertBefore,
2442                               Loop *OrigLoop = nullptr) {
2443   const DataLayout &DL = SE.getDataLayout();
2444   assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2445          "Induction step should be loop invariant");
2446   if (auto *E = dyn_cast<SCEVUnknown>(Step))
2447     return E->getValue();
2448 
2449   SCEVExpander Exp(SE, DL, "induction");
2450   return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2451 }
2452 
2453 /// Compute the transformed value of Index at offset StartValue using step
2454 /// StepValue.
2455 /// For integer induction, returns StartValue + Index * StepValue.
2456 /// For pointer induction, returns StartValue[Index * StepValue].
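/// E.g. (illustrative): for an integer induction with StartValue %start,
/// StepValue 4 and Index %n, this returns the equivalent of %start + %n * 4.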
2457 /// FIXME: The newly created binary instructions should contain nsw/nuw
2458 /// flags, which can be found from the original scalar operations.
2459 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2460                                    Value *StartValue, Value *Step,
2461                                    const InductionDescriptor &ID) {
2462   Type *StepTy = Step->getType();
2463   Value *CastedIndex = StepTy->isIntegerTy()
2464                            ? B.CreateSExtOrTrunc(Index, StepTy)
2465                            : B.CreateCast(Instruction::SIToFP, Index, StepTy);
2466   if (CastedIndex != Index) {
2467     CastedIndex->setName(CastedIndex->getName() + ".cast");
2468     Index = CastedIndex;
2469   }
2470 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // better code. Unfortunately, attempting to do so on invalid IR may lead to
  // various SCEV crashes. So all we can do is use the builder and rely on
  // InstCombine for future simplifications. Here we handle some trivial cases
  // only.
2477   auto CreateAdd = [&B](Value *X, Value *Y) {
2478     assert(X->getType() == Y->getType() && "Types don't match!");
2479     if (auto *CX = dyn_cast<ConstantInt>(X))
2480       if (CX->isZero())
2481         return Y;
2482     if (auto *CY = dyn_cast<ConstantInt>(Y))
2483       if (CY->isZero())
2484         return X;
2485     return B.CreateAdd(X, Y);
2486   };
2487 
2488   // We allow X to be a vector type, in which case Y will potentially be
2489   // splatted into a vector with the same element count.
2490   auto CreateMul = [&B](Value *X, Value *Y) {
2491     assert(X->getType()->getScalarType() == Y->getType() &&
2492            "Types don't match!");
2493     if (auto *CX = dyn_cast<ConstantInt>(X))
2494       if (CX->isOne())
2495         return Y;
2496     if (auto *CY = dyn_cast<ConstantInt>(Y))
2497       if (CY->isOne())
2498         return X;
2499     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2500     if (XVTy && !isa<VectorType>(Y->getType()))
2501       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2502     return B.CreateMul(X, Y);
2503   };
2504 
2505   switch (ID.getKind()) {
2506   case InductionDescriptor::IK_IntInduction: {
2507     assert(!isa<VectorType>(Index->getType()) &&
2508            "Vector indices not supported for integer inductions yet");
2509     assert(Index->getType() == StartValue->getType() &&
2510            "Index type does not match StartValue type");
2511     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2512       return B.CreateSub(StartValue, Index);
2513     auto *Offset = CreateMul(Index, Step);
2514     return CreateAdd(StartValue, Offset);
2515   }
2516   case InductionDescriptor::IK_PtrInduction: {
2517     assert(isa<Constant>(Step) &&
2518            "Expected constant step for pointer induction");
2519     return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2520   }
2521   case InductionDescriptor::IK_FpInduction: {
2522     assert(!isa<VectorType>(Index->getType()) &&
2523            "Vector indices not supported for FP inductions yet");
2524     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2525     auto InductionBinOp = ID.getInductionBinOp();
2526     assert(InductionBinOp &&
2527            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2528             InductionBinOp->getOpcode() == Instruction::FSub) &&
2529            "Original bin op should be defined for FP induction");
2530 
2531     Value *MulExp = B.CreateFMul(Step, Index);
2532     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2533                          "induction");
2534   }
2535   case InductionDescriptor::IK_NoInduction:
2536     return nullptr;
2537   }
2538   llvm_unreachable("invalid enum");
2539 }
2540 
2541 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2542                                                     const VPIteration &Instance,
2543                                                     VPTransformState &State) {
2544   Value *ScalarInst = State.get(Def, Instance);
2545   Value *VectorValue = State.get(Def, Instance.Part);
2546   VectorValue = Builder.CreateInsertElement(
2547       VectorValue, ScalarInst,
2548       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2549   State.set(Def, VectorValue, Instance.Part);
2550 }
2551 
2552 // Return whether we allow using masked interleave-groups (for dealing with
2553 // strided loads/stores that reside in predicated blocks, or for dealing
2554 // with gaps).
2555 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2556   // If an override option has been passed in for interleaved accesses, use it.
2557   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2558     return EnableMaskedInterleavedMemAccesses;
2559 
2560   return TTI.enableMaskedInterleavedAccessVectorization();
2561 }
2562 
2563 // Try to vectorize the interleave group that \p Instr belongs to.
2564 //
// E.g. Translate the following interleaved load group (factor = 3):
2566 //   for (i = 0; i < N; i+=3) {
2567 //     R = Pic[i];             // Member of index 0
2568 //     G = Pic[i+1];           // Member of index 1
2569 //     B = Pic[i+2];           // Member of index 2
2570 //     ... // do something to R, G, B
2571 //   }
2572 // To:
2573 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2574 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2575 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2576 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2577 //
// Or translate the following interleaved store group (factor = 3):
2579 //   for (i = 0; i < N; i+=3) {
2580 //     ... do something to R, G, B
2581 //     Pic[i]   = R;           // Member of index 0
2582 //     Pic[i+1] = G;           // Member of index 1
2583 //     Pic[i+2] = B;           // Member of index 2
2584 //   }
2585 // To:
2586 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2587 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2588 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2589 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2590 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2591 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2592     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2593     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2594     VPValue *BlockInMask) {
2595   Instruction *Instr = Group->getInsertPos();
2596   const DataLayout &DL = Instr->getModule()->getDataLayout();
2597 
2598   // Prepare for the vector type of the interleaved load/store.
2599   Type *ScalarTy = getLoadStoreType(Instr);
2600   unsigned InterleaveFactor = Group->getFactor();
2601   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2602   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2603 
2604   // Prepare for the new pointers.
2605   SmallVector<Value *, 2> AddrParts;
2606   unsigned Index = Group->getIndex(Instr);
2607 
2608   // TODO: extend the masked interleaved-group support to reversed access.
2609   assert((!BlockInMask || !Group->isReverse()) &&
2610          "Reversed masked interleave-group not supported.");
2611 
2612   // If the group is reverse, adjust the index to refer to the last vector lane
2613   // instead of the first. We adjust the index from the first vector lane,
2614   // rather than directly getting the pointer for lane VF - 1, because the
2615   // pointer operand of the interleaved access is supposed to be uniform. For
2616   // uniform instructions, we're only required to generate a value for the
2617   // first vector lane in each unroll iteration.
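  // E.g. (illustrative): with VF = 4 and an interleave factor of 3, the index
  // is advanced by (4 - 1) * 3 = 9 members to reach the last vector lane.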
2618   if (Group->isReverse())
2619     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2620 
2621   for (unsigned Part = 0; Part < UF; Part++) {
2622     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2623     State.setDebugLocFromInst(AddrPart);
2624 
    // Notice that the current instruction could be at any member index. We
    // need to adjust the address down to the member at index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2636 
2637     bool InBounds = false;
2638     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2639       InBounds = gep->isInBounds();
2640     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2641     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2642 
2643     // Cast to the vector pointer type.
2644     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2645     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2646     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2647   }
2648 
2649   State.setDebugLocFromInst(Instr);
2650   Value *PoisonVec = PoisonValue::get(VecTy);
2651 
2652   Value *MaskForGaps = nullptr;
2653   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2654     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2655     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2656   }
2657 
2658   // Vectorize the interleaved load group.
2659   if (isa<LoadInst>(Instr)) {
2660     // For each unroll part, create a wide load for the group.
2661     SmallVector<Value *, 2> NewLoads;
2662     for (unsigned Part = 0; Part < UF; Part++) {
2663       Instruction *NewLoad;
2664       if (BlockInMask || MaskForGaps) {
2665         assert(useMaskedInterleavedAccesses(*TTI) &&
2666                "masked interleaved groups are not allowed.");
2667         Value *GroupMask = MaskForGaps;
2668         if (BlockInMask) {
2669           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2670           Value *ShuffledMask = Builder.CreateShuffleVector(
2671               BlockInMaskPart,
2672               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2673               "interleaved.mask");
2674           GroupMask = MaskForGaps
2675                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2676                                                 MaskForGaps)
2677                           : ShuffledMask;
2678         }
2679         NewLoad =
2680             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2681                                      GroupMask, PoisonVec, "wide.masked.vec");
2682       }
2683       else
2684         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2685                                             Group->getAlign(), "wide.vec");
2686       Group->addMetadata(NewLoad);
2687       NewLoads.push_back(NewLoad);
2688     }
2689 
2690     // For each member in the group, shuffle out the appropriate data from the
2691     // wide loads.
2692     unsigned J = 0;
2693     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2694       Instruction *Member = Group->getMember(I);
2695 
2696       // Skip the gaps in the group.
2697       if (!Member)
2698         continue;
2699 
2700       auto StrideMask =
2701           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2702       for (unsigned Part = 0; Part < UF; Part++) {
2703         Value *StridedVec = Builder.CreateShuffleVector(
2704             NewLoads[Part], StrideMask, "strided.vec");
2705 
        // If this member has a different type, cast the result type.
2707         if (Member->getType() != ScalarTy) {
2708           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2709           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2710           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2711         }
2712 
2713         if (Group->isReverse())
2714           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2715 
2716         State.set(VPDefs[J], StridedVec, Part);
2717       }
2718       ++J;
2719     }
2720     return;
2721   }
2722 
  // The sub-vector type for the current instruction.
2724   auto *SubVT = VectorType::get(ScalarTy, VF);
2725 
2726   // Vectorize the interleaved store group.
2727   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2728   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2729          "masked interleaved groups are not allowed.");
2730   assert((!MaskForGaps || !VF.isScalable()) &&
2731          "masking gaps for scalable vectors is not yet supported.");
2732   for (unsigned Part = 0; Part < UF; Part++) {
2733     // Collect the stored vector from each member.
2734     SmallVector<Value *, 4> StoredVecs;
2735     unsigned StoredIdx = 0;
2736     for (unsigned i = 0; i < InterleaveFactor; i++) {
2737       assert((Group->getMember(i) || MaskForGaps) &&
2738              "Fail to get a member from an interleaved store group");
2739       Instruction *Member = Group->getMember(i);
2740 
2741       // Skip the gaps in the group.
2742       if (!Member) {
        Value *Poison = PoisonValue::get(SubVT);
        StoredVecs.push_back(Poison);
2745         continue;
2746       }
2747 
2748       Value *StoredVec = State.get(StoredValues[StoredIdx], Part);
2749       ++StoredIdx;
2750 
2751       if (Group->isReverse())
2752         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2753 
      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
2757         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2758 
2759       StoredVecs.push_back(StoredVec);
2760     }
2761 
2762     // Concatenate all vectors into a wide vector.
2763     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2764 
2765     // Interleave the elements in the wide vector.
2766     Value *IVec = Builder.CreateShuffleVector(
2767         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2768         "interleaved.vec");
2769 
2770     Instruction *NewStoreInstr;
2771     if (BlockInMask || MaskForGaps) {
2772       Value *GroupMask = MaskForGaps;
2773       if (BlockInMask) {
2774         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2775         Value *ShuffledMask = Builder.CreateShuffleVector(
2776             BlockInMaskPart,
2777             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2778             "interleaved.mask");
2779         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2780                                                       ShuffledMask, MaskForGaps)
2781                                 : ShuffledMask;
2782       }
2783       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2784                                                 Group->getAlign(), GroupMask);
2785     } else
2786       NewStoreInstr =
2787           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2788 
2789     Group->addMetadata(NewStoreInstr);
2790   }
2791 }
2792 
2793 void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2794                                                VPReplicateRecipe *RepRecipe,
2795                                                const VPIteration &Instance,
2796                                                bool IfPredicateInstr,
2797                                                VPTransformState &State) {
2798   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2799 
2800   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2801   // the first lane and part.
2802   if (isa<NoAliasScopeDeclInst>(Instr))
2803     if (!Instance.isFirstIteration())
2804       return;
2805 
  // Does this instruction return a value?
2807   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2808 
2809   Instruction *Cloned = Instr->clone();
2810   if (!IsVoidRetTy)
2811     Cloned->setName(Instr->getName() + ".cloned");
2812 
  // If the scalarized instruction contributes to the address computation of a
  // widened masked load/store which was in a basic block that needed
  // predication and is not predicated after vectorization, we can't propagate
  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
  // instruction could feed a poison value to the base address of the widened
  // load/store.
2819   if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2820     Cloned->dropPoisonGeneratingFlags();
2821 
2822   if (Instr->getDebugLoc())
2823     State.setDebugLocFromInst(Instr);
2824 
2825   // Replace the operands of the cloned instructions with their scalar
2826   // equivalents in the new loop.
2827   for (const auto &I : enumerate(RepRecipe->operands())) {
2828     auto InputInstance = Instance;
2829     VPValue *Operand = I.value();
2830     if (vputils::isUniformAfterVectorization(Operand))
2831       InputInstance.Lane = VPLane::getFirstLane();
2832     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2833   }
2834   State.addNewMetadata(Cloned, Instr);
2835 
2836   // Place the cloned scalar in the new loop.
2837   State.Builder.Insert(Cloned);
2838 
2839   State.set(RepRecipe, Cloned, Instance);
2840 
  // If we just cloned a new assumption, add it to the assumption cache.
2842   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2843     AC->registerAssumption(II);
2844 
2845   // End if-block.
2846   if (IfPredicateInstr)
2847     PredicatedInstructions.push_back(Cloned);
2848 }
2849 
2850 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2851   if (TripCount)
2852     return TripCount;
2853 
2854   assert(InsertBlock);
2855   IRBuilder<> Builder(InsertBlock->getTerminator());
2856   // Find the loop boundaries.
2857   Type *IdxTy = Legal->getWidestInductionType();
2858   assert(IdxTy && "No type for induction");
2859   const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE);
2860 
2861   const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2862 
2863   // Expand the trip count and place the new instructions in the preheader.
2864   // Notice that the pre-header does not change, only the loop body.
2865   SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2866 
2867   // Count holds the overall loop count (N).
2868   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2869                                 InsertBlock->getTerminator());
2870 
2871   if (TripCount->getType()->isPointerTy())
2872     TripCount =
2873         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2874                                     InsertBlock->getTerminator());
2875 
2876   return TripCount;
2877 }
2878 
2879 Value *
2880 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2881   if (VectorTripCount)
2882     return VectorTripCount;
2883 
2884   Value *TC = getOrCreateTripCount(InsertBlock);
2885   IRBuilder<> Builder(InsertBlock->getTerminator());
2886 
2887   Type *Ty = TC->getType();
2888   // This is where we can make the step a runtime constant.
2889   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2890 
2891   // If the tail is to be folded by masking, round the number of iterations N
2892   // up to a multiple of Step instead of rounding down. This is done by first
2893   // adding Step-1 and then rounding down. Note that it's ok if this addition
2894   // overflows: the vector induction variable will eventually wrap to zero given
2895   // that it starts at zero and its Step is a power of two; the loop will then
2896   // exit, with the last early-exit vector comparison also producing all-true.
2897   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2898   // is accounted for in emitIterationCountCheck that adds an overflow check.
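  // E.g. (illustrative): with VF * UF = 4 and N = 10, N is rounded up to 13,
  // so the vector trip count computed below becomes 12 and the masked vector
  // loop covers all 10 original iterations.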
2899   if (Cost->foldTailByMasking()) {
2900     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2901            "VF*UF must be a power of 2 when folding tail by masking");
2902     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2903     TC = Builder.CreateAdd(
2904         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2905   }
2906 
2907   // Now we need to generate the expression for the part of the loop that the
2908   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2909   // iterations are not required for correctness, or N - Step, otherwise. Step
2910   // is equal to the vectorization factor (number of SIMD elements) times the
2911   // unroll factor (number of SIMD instructions).
2912   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2913 
2914   // There are cases where we *must* run at least one iteration in the remainder
2915   // loop.  See the cost model for when this can happen.  If the step evenly
2916   // divides the trip count, we set the remainder to be equal to the step. If
2917   // the step does not evenly divide the trip count, no adjustment is necessary
2918   // since there will already be scalar iterations. Note that the minimum
2919   // iterations check ensures that N >= Step.
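  // E.g. (illustrative): with Step = 4 and N = 8, R would be 0; forcing R to 4
  // leaves a vector trip count of 4 and four scalar epilogue iterations.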
2920   if (Cost->requiresScalarEpilogue(VF)) {
2921     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2922     R = Builder.CreateSelect(IsZero, Step, R);
2923   }
2924 
2925   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2926 
2927   return VectorTripCount;
2928 }
2929 
2930 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2931                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as DstVTy.
2933   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2934   unsigned VF = DstFVTy->getNumElements();
2935   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2937   Type *SrcElemTy = SrcVecTy->getElementType();
2938   Type *DstElemTy = DstFVTy->getElementType();
2939   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2940          "Vector elements must have same size");
2941 
2942   // Do a direct cast if element types are castable.
2943   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2944     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2945   }
  // V cannot be directly cast to the desired vector type. This may happen
  // when V is a floating point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this using a two-step cast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
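  // E.g. (illustrative, assuming 64-bit pointers): a <4 x double> source is
  // first bitcast to <4 x i64> and then cast to the destination pointer
  // vector type.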
2950   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2951          "Only one type should be a pointer type");
2952   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2953          "Only one type should be a floating point type");
2954   Type *IntTy =
2955       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2956   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2957   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2958   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2959 }
2960 
2961 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2962   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
2963   // Reuse existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
2965   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2966   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2967 
2968   // Generate code to check if the loop's trip count is less than VF * UF, or
2969   // equal to it in case a scalar epilogue is required; this implies that the
2970   // vector trip count is zero. This check also covers the case where adding one
2971   // to the backedge-taken count overflowed leading to an incorrect trip count
2972   // of zero. In this case we will also jump to the scalar loop.
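  // E.g. (illustrative): with VF * UF = 4 and a required scalar epilogue, a
  // trip count of exactly 4 must still branch to the scalar loop, hence the
  // ULE predicate below.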
2973   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2974                                             : ICmpInst::ICMP_ULT;
2975 
2976   // If tail is to be folded, vector loop takes care of all iterations.
2977   Type *CountTy = Count->getType();
2978   Value *CheckMinIters = Builder.getFalse();
2979   auto CreateStep = [&]() -> Value * {
    // Create step with max(MinProfitableTripCount, UF * VF).
2981     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2982       return createStepForVF(Builder, CountTy, VF, UF);
2983 
2984     Value *MinProfTC =
2985         createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2986     if (!VF.isScalable())
2987       return MinProfTC;
2988     return Builder.CreateBinaryIntrinsic(
2989         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2990   };
2991 
2992   if (!Cost->foldTailByMasking())
2993     CheckMinIters =
2994         Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2995   else if (VF.isScalable()) {
2996     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2997     // an overflow to zero when updating induction variables and so an
2998     // additional overflow check is required before entering the vector loop.
2999 
3000     // Get the maximum unsigned value for the type.
3001     Value *MaxUIntTripCount =
3002         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
3003     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
3004 
3005     // Don't execute the vector loop if (UMax - n) < (VF * UF).
3006     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
3007   }
3008 
3009   // Create new preheader for vector loop.
3010   LoopVectorPreHeader =
3011       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3012                  "vector.ph");
3013 
3014   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3015                                DT->getNode(Bypass)->getIDom()) &&
3016          "TC check is expected to dominate Bypass");
3017 
3018   // Update dominator for Bypass & LoopExit (if needed).
3019   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3020   if (!Cost->requiresScalarEpilogue(VF))
3021     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3023     // dominator of the exit blocks.
3024     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3025 
3026   ReplaceInstWithInst(
3027       TCCheckBlock->getTerminator(),
3028       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3029   LoopBypassBlocks.push_back(TCCheckBlock);
3030 }
3031 
3032 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
3033   BasicBlock *const SCEVCheckBlock =
3034       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3035   if (!SCEVCheckBlock)
3036     return nullptr;
3037 
3038   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3039            (OptForSizeBasedOnProfile &&
3040             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3041          "Cannot SCEV check stride or overflow when optimizing for size");
3042 
3043 
3044   // Update dominator only if this is first RT check.
3045   if (LoopBypassBlocks.empty()) {
3046     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3047     if (!Cost->requiresScalarEpilogue(VF))
3048       // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
3050       // dominator of the exit blocks.
3051       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3052   }
3053 
3054   LoopBypassBlocks.push_back(SCEVCheckBlock);
3055   AddedSafetyChecks = true;
3056   return SCEVCheckBlock;
3057 }
3058 
3059 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3060   // VPlan-native path does not do any analysis for runtime checks currently.
3061   if (EnableVPlanNativePath)
3062     return nullptr;
3063 
3064   BasicBlock *const MemCheckBlock =
3065       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3066 
  // Check if we generated code that checks at runtime whether arrays overlap.
  // We put the checks into a separate block to make the more common case of
  // few elements faster.
3070   if (!MemCheckBlock)
3071     return nullptr;
3072 
3073   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3074     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3075            "Cannot emit memory checks when optimizing for size, unless forced "
3076            "to vectorize.");
3077     ORE->emit([&]() {
3078       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3079                                         OrigLoop->getStartLoc(),
3080                                         OrigLoop->getHeader())
3081              << "Code-size may be reduced by not forcing "
3082                 "vectorization, or by source-code modifications "
3083                 "eliminating the need for runtime checks "
3084                 "(e.g., adding 'restrict').";
3085     });
3086   }
3087 
3088   LoopBypassBlocks.push_back(MemCheckBlock);
3089 
3090   AddedSafetyChecks = true;
3091 
3092   return MemCheckBlock;
3093 }
3094 
3095 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3096   LoopScalarBody = OrigLoop->getHeader();
3097   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3098   assert(LoopVectorPreHeader && "Invalid loop structure");
3099   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3100   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3101          "multiple exit loop without required epilogue?");
3102 
3103   LoopMiddleBlock =
3104       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3105                  LI, nullptr, Twine(Prefix) + "middle.block");
3106   LoopScalarPreHeader =
3107       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3108                  nullptr, Twine(Prefix) + "scalar.ph");
3109 
3110   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3111 
3112   // Set up the middle block terminator.  Two cases:
3113   // 1) If we know that we must execute the scalar epilogue, emit an
3114   //    unconditional branch.
3115   // 2) Otherwise, we must have a single unique exit block (due to how we
3116   //    implement the multiple exit case).  In this case, set up a conditional
3117   //    branch from the middle block to the loop scalar preheader, and the
3118   //    exit block.  completeLoopSkeleton will update the condition to use an
3119   //    iteration check, if required to decide whether to execute the remainder.
3120   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3121     BranchInst::Create(LoopScalarPreHeader) :
3122     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3123                        Builder.getTrue());
3124   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3125   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3126 
3127   // Update dominator for loop exit. During skeleton creation, only the vector
3128   // pre-header and the middle block are created. The vector loop is entirely
  // created during VPlan execution.
3130   if (!Cost->requiresScalarEpilogue(VF))
3131     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3133     // dominator of the exit blocks.
3134     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3135 }
3136 
3137 PHINode *InnerLoopVectorizer::createInductionResumeValue(
3138     PHINode *OrigPhi, const InductionDescriptor &II,
3139     ArrayRef<BasicBlock *> BypassBlocks,
3140     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3141   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3142   assert(VectorTripCount && "Expected valid arguments");
3143 
3144   Instruction *OldInduction = Legal->getPrimaryInduction();
3145   Value *&EndValue = IVEndValues[OrigPhi];
3146   Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3147   if (OrigPhi == OldInduction) {
3148     // We know what the end value is.
3149     EndValue = VectorTripCount;
3150   } else {
3151     IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3152 
3153     // Fast-math-flags propagate from the original induction instruction.
3154     if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3155       B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3156 
3157     Value *Step =
3158         CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3159     EndValue =
3160         emitTransformedIndex(B, VectorTripCount, II.getStartValue(), Step, II);
3161     EndValue->setName("ind.end");
3162 
3163     // Compute the end value for the additional bypass (if applicable).
3164     if (AdditionalBypass.first) {
3165       B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3166       Value *Step =
3167           CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3168       EndValueFromAdditionalBypass = emitTransformedIndex(
3169           B, AdditionalBypass.second, II.getStartValue(), Step, II);
3170       EndValueFromAdditionalBypass->setName("ind.end");
3171     }
3172   }
3173 
  // Create phi nodes to merge from the backedge-taken check block.
3175   PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3176                                          LoopScalarPreHeader->getTerminator());
3177   // Copy original phi DL over to the new one.
3178   BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3179 
3180   // The new PHI merges the original incoming value, in case of a bypass,
3181   // or the value at the end of the vectorized loop.
3182   BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3183 
3184   // Fix the scalar body counter (PHI node).
3185   // The old induction's phi node in the scalar body needs the truncated
3186   // value.
3187   for (BasicBlock *BB : BypassBlocks)
3188     BCResumeVal->addIncoming(II.getStartValue(), BB);
3189 
3190   if (AdditionalBypass.first)
3191     BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3192                                           EndValueFromAdditionalBypass);
3193   return BCResumeVal;
3194 }
3195 
3196 void InnerLoopVectorizer::createInductionResumeValues(
3197     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3198   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3199           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3200          "Inconsistent information about additional bypass.");
3201   // We are going to resume the execution of the scalar loop.
3202   // Go over all of the induction variables that we found and fix the
3203   // PHIs that are left in the scalar version of the loop.
3204   // The starting values of PHI nodes depend on the counter of the last
3205   // iteration in the vectorized loop.
3206   // If we come from a bypass edge then we need to start from the original
3207   // start value.
3208   for (const auto &InductionEntry : Legal->getInductionVars()) {
3209     PHINode *OrigPhi = InductionEntry.first;
3210     const InductionDescriptor &II = InductionEntry.second;
3211     PHINode *BCResumeVal = createInductionResumeValue(
3212         OrigPhi, II, LoopBypassBlocks, AdditionalBypass);
3213     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3214   }
3215 }
3216 
3217 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3218   // The trip counts should be cached by now.
3219   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3220   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3221 
3222   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3223 
3224   // Add a check in the middle block to see if we have completed
3225   // all of the iterations in the first vector loop.  Three cases:
3226   // 1) If we require a scalar epilogue, there is no conditional branch as
3227   //    we unconditionally branch to the scalar preheader.  Do nothing.
3228   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3229   //    Thus if tail is to be folded, we know we don't need to run the
3230   //    remainder and we can use the previous value for the condition (true).
3231   // 3) Otherwise, construct a runtime check.
3232   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3233     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3234                                         Count, VectorTripCount, "cmp.n",
3235                                         LoopMiddleBlock->getTerminator());
3236 
3237     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3238     // of the corresponding compare because they may have ended up with
3239     // different line numbers and we want to avoid awkward line stepping while
3240     // debugging. Eg. if the compare has got a line number inside the loop.
3241     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3242     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3243   }
3244 
3245 #ifdef EXPENSIVE_CHECKS
3246   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3247 #endif
3248 
3249   return LoopVectorPreHeader;
3250 }
3251 
3252 std::pair<BasicBlock *, Value *>
3253 InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3254   /*
3255    In this function we generate a new loop. The new loop will contain
3256    the vectorized instructions while the old loop will continue to run the
3257    scalar remainder.
3258 
3259        [ ] <-- loop iteration number check.
3260     /   |
3261    /    v
3262   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3263   |  /  |
3264   | /   v
3265   ||   [ ]     <-- vector pre header.
3266   |/    |
3267   |     v
3268   |    [  ] \
3269   |    [  ]_|   <-- vector loop (created during VPlan execution).
3270   |     |
3271   |     v
3272   \   -[ ]   <--- middle-block.
3273    \/   |
3274    /\   v
3275    | ->[ ]     <--- new preheader.
3276    |    |
3277  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3278    |   [ ] \
3279    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3280     \   |
3281      \  v
3282       >[ ]     <-- exit block(s).
3283    ...
3284    */
3285 
3286   // Create an empty vector loop, and prepare basic blocks for the runtime
3287   // checks.
3288   createVectorLoopSkeleton("");
3289 
3290   // Now, compare the new count to zero. If it is zero skip the vector loop and
3291   // jump to the scalar loop. This check also covers the case where the
3292   // backedge-taken count is uint##_max: adding one to it will overflow leading
3293   // to an incorrect trip count of zero. In this (rare) case we will also jump
3294   // to the scalar loop.
3295   emitIterationCountCheck(LoopScalarPreHeader);
3296 
3297   // Generate the code to check any assumptions that we've made for SCEV
3298   // expressions.
3299   emitSCEVChecks(LoopScalarPreHeader);
3300 
3301   // Generate the code that checks in runtime if arrays overlap. We put the
3302   // checks into a separate block to make the more common case of few elements
3303   // faster.
3304   emitMemRuntimeChecks(LoopScalarPreHeader);
3305 
3306   // Emit phis for the new starting index of the scalar loop.
3307   createInductionResumeValues();
3308 
3309   return {completeLoopSkeleton(), nullptr};
3310 }
3311 
3312 // Fix up external users of the induction variable. At this point, we are
3313 // in LCSSA form, with all external PHIs that use the IV having one input value,
3314 // coming from the remainder loop. We need those PHIs to also have a correct
3315 // value for the IV when arriving directly from the middle block.
3316 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3317                                        const InductionDescriptor &II,
3318                                        Value *VectorTripCount, Value *EndValue,
3319                                        BasicBlock *MiddleBlock,
3320                                        BasicBlock *VectorHeader, VPlan &Plan) {
3321   // There are two kinds of external IV usages - those that use the value
3322   // computed in the last iteration (the PHI) and those that use the penultimate
3323   // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they obviously have different values.
3325 
3326   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3327 
3328   DenseMap<Value *, Value *> MissingVals;
3329 
3330   // An external user of the last iteration's value should see the value that
3331   // the remainder loop uses to initialize its own IV.
3332   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3333   for (User *U : PostInc->users()) {
3334     Instruction *UI = cast<Instruction>(U);
3335     if (!OrigLoop->contains(UI)) {
3336       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3337       MissingVals[UI] = EndValue;
3338     }
3339   }
3340 
  // An external user of the penultimate value needs to see EndValue - Step.
3342   // The simplest way to get this is to recompute it from the constituent SCEVs,
3343   // that is Start + (Step * (CRD - 1)).
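  // E.g. (illustrative): for an IV starting at 0 with step 1, the escaping
  // penultimate value is simply VectorTripCount - 1.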
3344   for (User *U : OrigPhi->users()) {
3345     auto *UI = cast<Instruction>(U);
3346     if (!OrigLoop->contains(UI)) {
3347       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3348 
3349       IRBuilder<> B(MiddleBlock->getTerminator());
3350 
3351       // Fast-math-flags propagate from the original induction instruction.
3352       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3353         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3354 
3355       Value *CountMinusOne = B.CreateSub(
3356           VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3357       CountMinusOne->setName("cmo");
3358       Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3359                                     VectorHeader->getTerminator());
3360       Value *Escape =
3361           emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
3362       Escape->setName("ind.escape");
3363       MissingVals[UI] = Escape;
3364     }
3365   }
3366 
3367   for (auto &I : MissingVals) {
3368     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3370     // that is %IV2 = phi [...], [ %IV1, %latch ]
3371     // In this case, if IV1 has an external use, we need to avoid adding both
3372     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3373     // don't already have an incoming value for the middle block.
3374     if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3375       PHI->addIncoming(I.second, MiddleBlock);
3376       Plan.removeLiveOut(PHI);
3377     }
3378   }
3379 }
3380 
3381 namespace {
3382 
3383 struct CSEDenseMapInfo {
3384   static bool canHandle(const Instruction *I) {
3385     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3386            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3387   }
3388 
3389   static inline Instruction *getEmptyKey() {
3390     return DenseMapInfo<Instruction *>::getEmptyKey();
3391   }
3392 
3393   static inline Instruction *getTombstoneKey() {
3394     return DenseMapInfo<Instruction *>::getTombstoneKey();
3395   }
3396 
3397   static unsigned getHashValue(const Instruction *I) {
3398     assert(canHandle(I) && "Unknown instruction!");
3399     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3400                                                            I->value_op_end()));
3401   }
3402 
3403   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3404     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3405         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3406       return LHS == RHS;
3407     return LHS->isIdenticalTo(RHS);
3408   }
3409 };
3410 
3411 } // end anonymous namespace
3412 
/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3416   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3417   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3418     if (!CSEDenseMapInfo::canHandle(&In))
3419       continue;
3420 
3421     // Check if we can replace this instruction with any of the
3422     // visited instructions.
3423     if (Instruction *V = CSEMap.lookup(&In)) {
3424       In.replaceAllUsesWith(V);
3425       In.eraseFromParent();
3426       continue;
3427     }
3428 
3429     CSEMap[&In] = &In;
3430   }
3431 }
3432 
3433 InstructionCost
3434 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3435                                               bool &NeedToScalarize) const {
3436   Function *F = CI->getCalledFunction();
3437   Type *ScalarRetTy = CI->getType();
3438   SmallVector<Type *, 4> Tys, ScalarTys;
3439   for (auto &ArgOp : CI->args())
3440     ScalarTys.push_back(ArgOp->getType());
3441 
3442   // Estimate cost of scalarized vector call. The source operands are assumed
3443   // to be vectors, so we need to extract individual elements from there,
3444   // execute VF scalar calls, and then gather the result into the vector return
3445   // value.
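  // E.g. (illustrative): with VF = 4, a scalar call cost of 10 and a
  // scalarization overhead of 6, the scalarized estimate is 4 * 10 + 6 = 46.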
3446   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3447   InstructionCost ScalarCallCost =
3448       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind);
3449   if (VF.isScalar())
3450     return ScalarCallCost;
3451 
3452   // Compute corresponding vector type for return value and arguments.
3453   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3454   for (Type *ScalarTy : ScalarTys)
3455     Tys.push_back(ToVectorTy(ScalarTy, VF));
3456 
3457   // Compute costs of unpacking argument values for the scalar calls and
3458   // packing the return values to a vector.
3459   InstructionCost ScalarizationCost =
3460       getScalarizationOverhead(CI, VF, CostKind);
3461 
3462   InstructionCost Cost =
3463       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3464 
3465   // If we can't emit a vector call for this function, then the currently found
3466   // cost is the cost we need to return.
3467   NeedToScalarize = true;
3468   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3469   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3470 
3471   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3472     return Cost;
3473 
3474   // If the corresponding vector cost is cheaper, return its cost.
3475   InstructionCost VectorCallCost =
3476       TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
3477   if (VectorCallCost < Cost) {
3478     NeedToScalarize = false;
3479     Cost = VectorCallCost;
3480   }
3481   return Cost;
3482 }
3483 
3484 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3485   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3486     return Elt;
3487   return VectorType::get(Elt, VF);
3488 }
3489 
3490 InstructionCost
3491 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3492                                                    ElementCount VF) const {
3493   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3494   assert(ID && "Expected intrinsic call!");
3495   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3496   FastMathFlags FMF;
3497   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3498     FMF = FPMO->getFastMathFlags();
3499 
3500   SmallVector<const Value *> Arguments(CI->args());
3501   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3502   SmallVector<Type *> ParamTys;
3503   std::transform(FTy->param_begin(), FTy->param_end(),
3504                  std::back_inserter(ParamTys),
3505                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3506 
3507   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3508                                     dyn_cast<IntrinsicInst>(CI));
3509   return TTI.getIntrinsicInstrCost(CostAttrs,
3510                                    TargetTransformInfo::TCK_RecipThroughput);
3511 }
3512 
3513 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3514   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3515   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3516   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3517 }
3518 
3519 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3520   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3521   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3522   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3523 }
3524 
3525 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I`, and re-extend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
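  // E.g. (illustrative): a <4 x i32> add whose result is known to need only 8
  // bits becomes trunc -> <4 x i8> add -> zext back to <4 x i32>.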
3529   SmallPtrSet<Value *, 4> Erased;
3530   for (const auto &KV : Cost->getMinimalBitwidths()) {
3531     // If the value wasn't vectorized, we must maintain the original scalar
3532     // type. The absence of the value from State indicates that it
3533     // wasn't vectorized.
3534     // FIXME: Should not rely on getVPValue at this point.
3535     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3536     if (!State.hasAnyVectorValue(Def))
3537       continue;
3538     for (unsigned Part = 0; Part < UF; ++Part) {
3539       Value *I = State.get(Def, Part);
3540       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3541         continue;
3542       Type *OriginalTy = I->getType();
3543       Type *ScalarTruncatedTy =
3544           IntegerType::get(OriginalTy->getContext(), KV.second);
3545       auto *TruncatedTy = VectorType::get(
3546           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3547       if (TruncatedTy == OriginalTy)
3548         continue;
3549 
3550       IRBuilder<> B(cast<Instruction>(I));
3551       auto ShrinkOperand = [&](Value *V) -> Value * {
3552         if (auto *ZI = dyn_cast<ZExtInst>(V))
3553           if (ZI->getSrcTy() == TruncatedTy)
3554             return ZI->getOperand(0);
3555         return B.CreateZExtOrTrunc(V, TruncatedTy);
3556       };
3557 
3558       // The actual instruction modification depends on the instruction type,
3559       // unfortunately.
3560       Value *NewI = nullptr;
3561       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3562         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3563                              ShrinkOperand(BO->getOperand(1)));
3564 
3565         // Any wrapping introduced by shrinking this operation shouldn't be
3566         // considered undefined behavior. So, we can't unconditionally copy
3567         // arithmetic wrapping flags to NewI.
3568         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3569       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3570         NewI =
3571             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3572                          ShrinkOperand(CI->getOperand(1)));
3573       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3574         NewI = B.CreateSelect(SI->getCondition(),
3575                               ShrinkOperand(SI->getTrueValue()),
3576                               ShrinkOperand(SI->getFalseValue()));
3577       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3578         switch (CI->getOpcode()) {
3579         default:
3580           llvm_unreachable("Unhandled cast!");
3581         case Instruction::Trunc:
3582           NewI = ShrinkOperand(CI->getOperand(0));
3583           break;
3584         case Instruction::SExt:
3585           NewI = B.CreateSExtOrTrunc(
3586               CI->getOperand(0),
3587               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3588           break;
3589         case Instruction::ZExt:
3590           NewI = B.CreateZExtOrTrunc(
3591               CI->getOperand(0),
3592               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3593           break;
3594         }
3595       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3596         auto Elements0 =
3597             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3598         auto *O0 = B.CreateZExtOrTrunc(
3599             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3600         auto Elements1 =
3601             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3602         auto *O1 = B.CreateZExtOrTrunc(
3603             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3604 
3605         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3606       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3607         // Don't do anything with the operands, just extend the result.
3608         continue;
3609       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3610         auto Elements =
3611             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3612         auto *O0 = B.CreateZExtOrTrunc(
3613             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3614         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3615         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3616       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3617         auto Elements =
3618             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3619         auto *O0 = B.CreateZExtOrTrunc(
3620             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3621         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3622       } else {
3623         // If we don't know what to do, be conservative and don't do anything.
3624         continue;
3625       }
3626 
3627       // Lastly, extend the result.
3628       NewI->takeName(cast<Instruction>(I));
3629       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3630       I->replaceAllUsesWith(Res);
3631       cast<Instruction>(I)->eraseFromParent();
3632       Erased.insert(I);
3633       State.reset(Def, Res, Part);
3634     }
3635   }
3636 
  // We'll have created a bunch of ZExts that are now dead. Clean them up.
3638   for (const auto &KV : Cost->getMinimalBitwidths()) {
3639     // If the value wasn't vectorized, we must maintain the original scalar
3640     // type. The absence of the value from State indicates that it
3641     // wasn't vectorized.
3642     // FIXME: Should not rely on getVPValue at this point.
3643     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3644     if (!State.hasAnyVectorValue(Def))
3645       continue;
3646     for (unsigned Part = 0; Part < UF; ++Part) {
3647       Value *I = State.get(Def, Part);
3648       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3649       if (Inst && Inst->use_empty()) {
3650         Value *NewI = Inst->getOperand(0);
3651         Inst->eraseFromParent();
3652         State.reset(Def, NewI, Part);
3653       }
3654     }
3655   }
3656 }
3657 
3658 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3659                                             VPlan &Plan) {
3660   // Insert truncates and extends for any truncated instructions as hints to
3661   // InstCombine.
3662   if (VF.isVector())
3663     truncateToMinimalBitwidths(State);
3664 
3665   // Fix widened non-induction PHIs by setting up the PHI operands.
3666   if (EnableVPlanNativePath)
3667     fixNonInductionPHIs(Plan, State);
3668 
3669   // At this point every instruction in the original loop is widened to a
3670   // vector form. Now we need to fix the recurrences in the loop. These PHI
3671   // nodes are currently empty because we did not want to introduce cycles.
3672   // This is the second stage of vectorizing recurrences.
3673   fixCrossIterationPHIs(State);
3674 
3675   // Forget the original basic block.
3676   PSE.getSE()->forgetLoop(OrigLoop);
3677 
3678   VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3679   Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3680   if (Cost->requiresScalarEpilogue(VF)) {
3681     // No edge from the middle block to the unique exit block has been inserted
3682     // and there is nothing to fix from vector loop; phis should have incoming
3683     // from scalar loop only.
3684     Plan.clearLiveOuts();
3685   } else {
3686     // If we inserted an edge from the middle block to the unique exit block,
3687     // update uses outside the loop (phis) to account for the newly inserted
3688     // edge.
3689 
3690     // Fix-up external users of the induction variables.
3691     for (const auto &Entry : Legal->getInductionVars())
3692       fixupIVUsers(Entry.first, Entry.second,
3693                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3694                    IVEndValues[Entry.first], LoopMiddleBlock,
3695                    VectorLoop->getHeader(), Plan);
3696   }
3697 
3698   // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3699   // in the exit block, so update the builder.
3700   State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
3701   for (const auto &KV : Plan.getLiveOuts())
3702     KV.second->fixPhi(Plan, State);
3703 
3704   for (Instruction *PI : PredicatedInstructions)
3705     sinkScalarOperands(&*PI);
3706 
3707   // Remove redundant induction instructions.
3708   cse(VectorLoop->getHeader());
3709 
3710   // Set/update profile weights for the vector and remainder loops as original
3711   // loop iterations are now distributed among them. Note that original loop
3712   // represented by LoopScalarBody becomes remainder loop after vectorization.
3713   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting a slightly inaccurate result but that should be OK since
  // the profile is not inherently precise anyway. Note also possible bypass of
3717   // vector code caused by legality checks is ignored, assigning all the weight
3718   // to the vector loop, optimistically.
3719   //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
3723   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3724                                LI->getLoopFor(LoopScalarBody),
3725                                VF.getKnownMinValue() * UF);
3726 }
3727 
3728 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3729   // In order to support recurrences we need to be able to vectorize Phi nodes.
3730   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3731   // stage #2: We now need to fix the recurrences by adding incoming edges to
3732   // the currently empty PHI nodes. At this point every instruction in the
3733   // original loop is widened to a vector form so we can use them to construct
3734   // the incoming edges.
3735   VPBasicBlock *Header =
3736       State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3737   for (VPRecipeBase &R : Header->phis()) {
3738     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3739       fixReduction(ReductionPhi, State);
3740     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3741       fixFixedOrderRecurrence(FOR, State);
3742   }
3743 }
3744 
3745 void InnerLoopVectorizer::fixFixedOrderRecurrence(
3746     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3747   // This is the second phase of vectorizing first-order recurrences. An
3748   // overview of the transformation is described below. Suppose we have the
3749   // following loop.
3750   //
3751   //   for (int i = 0; i < n; ++i)
3752   //     b[i] = a[i] - a[i - 1];
3753   //
3754   // There is a first-order recurrence on "a". For this loop, the shorthand
3755   // scalar IR looks like:
3756   //
3757   //   scalar.ph:
3758   //     s_init = a[-1]
3759   //     br scalar.body
3760   //
3761   //   scalar.body:
3762   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3763   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3764   //     s2 = a[i]
3765   //     b[i] = s2 - s1
3766   //     br cond, scalar.body, ...
3767   //
  // In this example, s1 is a recurrence because its value depends on the
3769   // previous iteration. In the first phase of vectorization, we created a
3770   // vector phi v1 for s1. We now complete the vectorization and produce the
3771   // shorthand vector IR shown below (for VF = 4, UF = 1).
3772   //
3773   //   vector.ph:
3774   //     v_init = vector(..., ..., ..., a[-1])
3775   //     br vector.body
3776   //
3777   //   vector.body
3778   //     i = phi [0, vector.ph], [i+4, vector.body]
3779   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3780   //     v2 = a[i, i+1, i+2, i+3];
3781   //     v3 = vector(v1(3), v2(0, 1, 2))
3782   //     b[i, i+1, i+2, i+3] = v2 - v3
3783   //     br cond, vector.body, middle.block
3784   //
3785   //   middle.block:
3786   //     x = v2(3)
3787   //     br scalar.ph
3788   //
3789   //   scalar.ph:
3790   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3791   //     br scalar.body
3792   //
3793   // After execution completes the vector loop, we extract the next value of
3794   // the recurrence (x) to use as the initial value in the scalar loop.
3795 
3796   // Extract the last vector element in the middle block. This will be the
3797   // initial value for the recurrence when jumping to the scalar loop.
3798   VPValue *PreviousDef = PhiR->getBackedgeValue();
3799   Value *Incoming = State.get(PreviousDef, UF - 1);
3800   auto *ExtractForScalar = Incoming;
3801   auto *IdxTy = Builder.getInt32Ty();
3802   if (VF.isVector()) {
3803     auto *One = ConstantInt::get(IdxTy, 1);
3804     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3805     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3806     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3807     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3808                                                     "vector.recur.extract");
3809   }
  // Extract the second-to-last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
3815   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3816   if (VF.isVector()) {
3817     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3818     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3819     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3820         Incoming, Idx, "vector.recur.extract.for.phi");
3821   } else if (UF > 1)
    // When the loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
    // value of `Incoming`. This is analogous to the vectorized case above:
    // extracting the second-to-last element when VF > 1.
3826     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3827 
3828   // Fix the initial value of the original recurrence in the scalar loop.
3829   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3830   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3831   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3832   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3833   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3834     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3835     Start->addIncoming(Incoming, BB);
3836   }
3837 
3838   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3839   Phi->setName("scalar.recur");
3840 
3841   // Finally, fix users of the recurrence outside the loop. The users will need
3842   // either the last value of the scalar recurrence or the last value of the
3843   // vector recurrence we extracted in the middle block. Since the loop is in
3844   // LCSSA form, we just need to find all the phi nodes for the original scalar
3845   // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from middle to exit and
  // thus no phis which need to be updated.
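  //
  // Illustrative shorthand of the update performed below: an exit-block phi
  //   %s1.lcssa = phi [ %s1, %scalar.body ]
  // gains an incoming value from the middle block,
  //   %s1.lcssa = phi [ %s1, %scalar.body ],
  //                   [ %vector.recur.extract.for.phi, %middle.block ]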
3850   if (!Cost->requiresScalarEpilogue(VF))
3851     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3852       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3853         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3854         State.Plan->removeLiveOut(&LCSSAPhi);
3855       }
3856 }
3857 
3858 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3859                                        VPTransformState &State) {
3860   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
3862   assert(Legal->isReductionVariable(OrigPhi) &&
3863          "Unable to find the reduction variable");
3864   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3865 
3866   RecurKind RK = RdxDesc.getRecurrenceKind();
3867   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3868   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3869   State.setDebugLocFromInst(ReductionStartValue);
3870 
3871   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3872   // This is the vector-clone of the value that leaves the loop.
3873   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3874 
3875   // Wrap flags are in general invalid after vectorization, clear them.
3876   clearReductionWrapFlags(PhiR, State);
3877 
3878   // Before each round, move the insertion point right between
3879   // the PHIs and the values we are going to write.
3880   // This allows us to write both PHINodes and the extractelement
3881   // instructions.
3882   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3883 
3884   State.setDebugLocFromInst(LoopExitInst);
3885 
3886   Type *PhiTy = OrigPhi->getType();
3887 
3888   VPBasicBlock *LatchVPBB =
3889       PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3890   BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3891   // If tail is folded by masking, the vector value to leave the loop should be
3892   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3893   // instead of the former. For an inloop reduction the reduction will already
3894   // be predicated, and does not need to be handled here.
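  //
  // Illustrative shorthand (assuming an add reduction with VF = 4): with a
  // folded tail the loop body contains
  //   %rdx.next = add <4 x i32> %rdx.phi, %val
  //   %rdx.sel  = select <4 x i1> %mask, <4 x i32> %rdx.next,
  //                      <4 x i32> %rdx.phi
  // and it is %rdx.sel, not %rdx.next, that must be treated as the value
  // leaving the loop.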
3895   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3896     for (unsigned Part = 0; Part < UF; ++Part) {
3897       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3898       SelectInst *Sel = nullptr;
3899       for (User *U : VecLoopExitInst->users()) {
3900         if (isa<SelectInst>(U)) {
3901           assert(!Sel && "Reduction exit feeding two selects");
3902           Sel = cast<SelectInst>(U);
3903         } else
3904           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3905       }
3906       assert(Sel && "Reduction exit feeds no select");
3907       State.reset(LoopExitInstDef, Sel, Part);
3908 
3909       if (isa<FPMathOperator>(Sel))
3910         Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
3911 
3912       // If the target can create a predicated operator for the reduction at no
3913       // extra cost in the loop (for example a predicated vadd), it can be
3914       // cheaper for the select to remain in the loop than be sunk out of it,
3915       // and so use the select value for the phi instead of the old
3916       // LoopExitValue.
3917       if (PreferPredicatedReductionSelect ||
3918           TTI->preferPredicatedReductionSelect(
3919               RdxDesc.getOpcode(), PhiTy,
3920               TargetTransformInfo::ReductionFlags())) {
        auto *VecRdxPhi = cast<PHINode>(State.get(PhiR, Part));
3923         VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3924       }
3925     }
3926   }
3927 
3928   // If the vector reduction can be performed in a smaller type, we truncate
3929   // then extend the loop exit value to enable InstCombine to evaluate the
3930   // entire expression in the smaller type.
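  //
  // Illustrative shorthand: a reduction carried in an i32 phi but computable
  // in i8 has each unrolled part rewritten in the latch roughly as
  //   %trunc = trunc <4 x i32> %rdx to <4 x i8>
  //   %extnd = zext <4 x i8> %trunc to <4 x i32>   ; sext if signed
  // with other users of %rdx redirected to %extnd; the middle block then
  // truncates again and performs the reduction in the narrow type.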
3931   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3932     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3933     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3934     Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3935     VectorParts RdxParts(UF);
3936     for (unsigned Part = 0; Part < UF; ++Part) {
3937       RdxParts[Part] = State.get(LoopExitInstDef, Part);
3938       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3939       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3940                                         : Builder.CreateZExt(Trunc, VecTy);
3941       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3942         if (U != Trunc) {
3943           U->replaceUsesOfWith(RdxParts[Part], Extnd);
3944           RdxParts[Part] = Extnd;
3945         }
3946     }
3947     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3948     for (unsigned Part = 0; Part < UF; ++Part) {
3949       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3950       State.reset(LoopExitInstDef, RdxParts[Part], Part);
3951     }
3952   }
3953 
3954   // Reduce all of the unrolled parts into a single vector.
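  // E.g. (illustrative) for UF = 2 an integer add reduction combines the two
  // parts with a single "bin.rdx" before the final horizontal reduction:
  //   %bin.rdx = add <4 x i32> %rdx.part.1, %rdx.part.0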
3955   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3956   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3957 
3958   // The middle block terminator has already been assigned a DebugLoc here (the
3959   // OrigLoop's single latch terminator). We want the whole middle block to
3960   // appear to execute on this line because: (a) it is all compiler generated,
3961   // (b) these instructions are always executed after evaluating the latch
3962   // conditional branch, and (c) other passes may add new predecessors which
3963   // terminate on this line. This is the easiest way to ensure we don't
3964   // accidentally cause an extra step back into the loop while debugging.
3965   State.setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3966   if (PhiR->isOrdered())
3967     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3968   else {
3969     // Floating-point operations should have some FMF to enable the reduction.
3970     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3971     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3972     for (unsigned Part = 1; Part < UF; ++Part) {
3973       Value *RdxPart = State.get(LoopExitInstDef, Part);
3974       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3975         ReducedPartRdx = Builder.CreateBinOp(
3976             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3977       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3978         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3979                                            ReducedPartRdx, RdxPart);
3980       else
3981         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3982     }
3983   }
3984 
3985   // Create the reduction after the loop. Note that inloop reductions create the
3986   // target reduction in the loop using a Reduction recipe.
3987   if (VF.isVector() && !PhiR->isInLoop()) {
3988     ReducedPartRdx =
3989         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3990     // If the reduction can be performed in a smaller type, we need to extend
3991     // the reduction to the wider type before we branch to the original loop.
3992     if (PhiTy != RdxDesc.getRecurrenceType())
3993       ReducedPartRdx = RdxDesc.isSigned()
3994                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3995                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3996   }
3997 
3998   PHINode *ResumePhi =
3999       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
4000 
4001   // Create a phi node that merges control-flow from the backedge-taken check
4002   // block and the middle block.
4003   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4004                                         LoopScalarPreHeader->getTerminator());
4005 
4006   // If we are fixing reductions in the epilogue loop then we should already
4007   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4008   // we carry over the incoming values correctly.
4009   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4010     if (Incoming == LoopMiddleBlock)
4011       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4012     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4013       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4014                               Incoming);
4015     else
4016       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4017   }
4018 
4019   // Set the resume value for this reduction
4020   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4021 
4022   // If there were stores of the reduction value to a uniform memory address
4023   // inside the loop, create the final store here.
4024   if (StoreInst *SI = RdxDesc.IntermediateStore) {
4025     StoreInst *NewSI =
4026         Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
4027     propagateMetadata(NewSI, SI);
4028 
4029     // If the reduction value is used in other places,
    // then let the code below create PHIs for that.
4031   }
4032 
4033   // Now, we need to fix the users of the reduction variable
4034   // inside and outside of the scalar remainder loop.
4035 
4036   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4037   // in the exit blocks.  See comment on analogous loop in
  // fixFixedOrderRecurrence for a more complete explanation of the logic.
4039   if (!Cost->requiresScalarEpilogue(VF))
4040     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4041       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
4042         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4043         State.Plan->removeLiveOut(&LCSSAPhi);
4044       }
4045 
4046   // Fix the scalar loop reduction variable with the incoming reduction sum
4047   // from the vector body and from the backedge value.
4048   int IncomingEdgeBlockIdx =
4049       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4050   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4051   // Pick the other block.
4052   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4053   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4054   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4055 }
4056 
4057 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
4058                                                   VPTransformState &State) {
4059   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4060   RecurKind RK = RdxDesc.getRecurrenceKind();
4061   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4062     return;
4063 
4064   SmallVector<VPValue *, 8> Worklist;
4065   SmallPtrSet<VPValue *, 8> Visited;
4066   Worklist.push_back(PhiR);
4067   Visited.insert(PhiR);
4068 
4069   while (!Worklist.empty()) {
4070     VPValue *Cur = Worklist.pop_back_val();
4071     for (unsigned Part = 0; Part < UF; ++Part) {
4072       Value *V = State.get(Cur, Part);
4073       if (!isa<OverflowingBinaryOperator>(V))
4074         break;
4075       cast<Instruction>(V)->dropPoisonGeneratingFlags();
    }

    for (VPUser *U : Cur->users()) {
      auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
      if (!UserRecipe)
        continue;
      for (VPValue *V : UserRecipe->definedValues())
        if (Visited.insert(V).second)
          Worklist.push_back(V);
    }
4086   }
4087 }
4088 
4089 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4090   // The basic block and loop containing the predicated instruction.
4091   auto *PredBB = PredInst->getParent();
4092   auto *VectorLoop = LI->getLoopFor(PredBB);
4093 
4094   // Initialize a worklist with the operands of the predicated instruction.
4095   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4096 
4097   // Holds instructions that we need to analyze again. An instruction may be
4098   // reanalyzed if we don't yet know if we can sink it or not.
4099   SmallVector<Instruction *, 8> InstsToReanalyze;
4100 
4101   // Returns true if a given use occurs in the predicated block. Phi nodes use
4102   // their operands in their corresponding predecessor blocks.
4103   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4104     auto *I = cast<Instruction>(U.getUser());
4105     BasicBlock *BB = I->getParent();
4106     if (auto *Phi = dyn_cast<PHINode>(I))
4107       BB = Phi->getIncomingBlock(
4108           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4109     return BB == PredBB;
4110   };
4111 
4112   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
4114   // operands are then added to the worklist. The algorithm ends after one pass
4115   // through the worklist doesn't sink a single instruction.
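  //
  // Illustrative example: if VPlan already sank a scalarized store into the
  // predicated block, its address GEP can be sunk after it here, which may in
  // turn allow the GEP's index computation to be sunk on a later pass over the
  // worklist.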
4116   bool Changed;
4117   do {
4118     // Add the instructions that need to be reanalyzed to the worklist, and
4119     // reset the changed indicator.
4120     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4121     InstsToReanalyze.clear();
4122     Changed = false;
4123 
4124     while (!Worklist.empty()) {
4125       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4126 
4127       // We can't sink an instruction if it is a phi node, is not in the loop,
4128       // or may have side effects.
4129       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4130           I->mayHaveSideEffects())
4131         continue;
4132 
4133       // If the instruction is already in PredBB, check if we can sink its
4134       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4135       // sinking the scalar instruction I, hence it appears in PredBB; but it
4136       // may have failed to sink I's operands (recursively), which we try
4137       // (again) here.
4138       if (I->getParent() == PredBB) {
4139         Worklist.insert(I->op_begin(), I->op_end());
4140         continue;
4141       }
4142 
4143       // It's legal to sink the instruction if all its uses occur in the
4144       // predicated block. Otherwise, there's nothing to do yet, and we may
4145       // need to reanalyze the instruction.
4146       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4147         InstsToReanalyze.push_back(I);
4148         continue;
4149       }
4150 
4151       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4153       I->moveBefore(&*PredBB->getFirstInsertionPt());
4154       Worklist.insert(I->op_begin(), I->op_end());
4155 
4156       // The sinking may have enabled other instructions to be sunk, so we will
4157       // need to iterate.
4158       Changed = true;
4159     }
4160   } while (Changed);
4161 }
4162 
4163 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
4164                                               VPTransformState &State) {
4165   auto Iter = vp_depth_first_deep(Plan.getEntry());
4166   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4167     for (VPRecipeBase &P : VPBB->phis()) {
4168       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
4169       if (!VPPhi)
4170         continue;
4171       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4172       // Make sure the builder has a valid insert point.
4173       Builder.SetInsertPoint(NewPhi);
4174       for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4175         VPValue *Inc = VPPhi->getIncomingValue(i);
4176         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4177         NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4178       }
4179     }
4180   }
4181 }
4182 
4183 bool InnerLoopVectorizer::useOrderedReductions(
4184     const RecurrenceDescriptor &RdxDesc) {
4185   return Cost->useOrderedReductions(RdxDesc);
4186 }
4187 
4188 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4189   // We should not collect Scalars more than once per VF. Right now, this
4190   // function is called from collectUniformsAndScalars(), which already does
4191   // this check. Collecting Scalars for VF=1 does not make any sense.
4192   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4193          "This function should not be visited twice for the same VF");
4194 
4195   // This avoids any chances of creating a REPLICATE recipe during planning
4196   // since that would result in generation of scalarized code during execution,
4197   // which is not supported for scalable vectors.
4198   if (VF.isScalable()) {
4199     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4200     return;
4201   }
4202 
4203   SmallSetVector<Instruction *, 8> Worklist;
4204 
4205   // These sets are used to seed the analysis with pointers used by memory
4206   // accesses that will remain scalar.
4207   SmallSetVector<Instruction *, 8> ScalarPtrs;
4208   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4209   auto *Latch = TheLoop->getLoopLatch();
4210 
4211   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4212   // The pointer operands of loads and stores will be scalar as long as the
4213   // memory access is not a gather or scatter operation. The value operand of a
4214   // store will remain scalar if the store is scalarized.
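  // E.g. (illustrative) a getelementptr feeding a consecutive (CM_Widen) load
  // is a scalar pointer use, whereas the same getelementptr feeding a
  // gather/scatter (CM_GatherScatter) access is not.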
4215   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4216     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4217     assert(WideningDecision != CM_Unknown &&
4218            "Widening decision should be ready at this moment");
4219     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4220       if (Ptr == Store->getValueOperand())
4221         return WideningDecision == CM_Scalarize;
4222     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
4224     return WideningDecision != CM_GatherScatter;
4225   };
4226 
4227   // A helper that returns true if the given value is a bitcast or
4228   // getelementptr instruction contained in the loop.
4229   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4230     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4231             isa<GetElementPtrInst>(V)) &&
4232            !TheLoop->isLoopInvariant(V);
4233   };
4234 
4235   // A helper that evaluates a memory access's use of a pointer. If the use will
4236   // be a scalar use and the pointer is only used by memory accesses, we place
4237   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4238   // PossibleNonScalarPtrs.
4239   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4240     // We only care about bitcast and getelementptr instructions contained in
4241     // the loop.
4242     if (!isLoopVaryingBitCastOrGEP(Ptr))
4243       return;
4244 
4245     // If the pointer has already been identified as scalar (e.g., if it was
4246     // also identified as uniform), there's nothing to do.
4247     auto *I = cast<Instruction>(Ptr);
4248     if (Worklist.count(I))
4249       return;
4250 
4251     // If the use of the pointer will be a scalar use, and all users of the
4252     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4253     // place the pointer in PossibleNonScalarPtrs.
4254     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4255           return isa<LoadInst>(U) || isa<StoreInst>(U);
4256         }))
4257       ScalarPtrs.insert(I);
4258     else
4259       PossibleNonScalarPtrs.insert(I);
4260   };
4261 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
4266   //
4267   // (1) Add to the worklist all instructions that have been identified as
4268   // uniform-after-vectorization.
4269   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4270 
4271   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4272   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4274   // scatter operation. The value operand of a store will remain scalar if the
4275   // store is scalarized.
4276   for (auto *BB : TheLoop->blocks())
4277     for (auto &I : *BB) {
4278       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4279         evaluatePtrUse(Load, Load->getPointerOperand());
4280       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4281         evaluatePtrUse(Store, Store->getPointerOperand());
4282         evaluatePtrUse(Store, Store->getValueOperand());
4283       }
4284     }
4285   for (auto *I : ScalarPtrs)
4286     if (!PossibleNonScalarPtrs.count(I)) {
4287       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4288       Worklist.insert(I);
4289     }
4290 
4291   // Insert the forced scalars.
4292   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
4293   // induction variable when the PHI user is scalarized.
4294   auto ForcedScalar = ForcedScalars.find(VF);
4295   if (ForcedScalar != ForcedScalars.end())
4296     for (auto *I : ForcedScalar->second) {
      LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I
                        << "\n");
4298       Worklist.insert(I);
4299     }
4300 
4301   // Expand the worklist by looking through any bitcasts and getelementptr
4302   // instructions we've already identified as scalar. This is similar to the
4303   // expansion step in collectLoopUniforms(); however, here we're only
4304   // expanding to include additional bitcasts and getelementptr instructions.
4305   unsigned Idx = 0;
4306   while (Idx != Worklist.size()) {
4307     Instruction *Dst = Worklist[Idx++];
4308     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4309       continue;
4310     auto *Src = cast<Instruction>(Dst->getOperand(0));
4311     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4312           auto *J = cast<Instruction>(U);
4313           return !TheLoop->contains(J) || Worklist.count(J) ||
4314                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4315                   isScalarUse(J, Src));
4316         })) {
4317       Worklist.insert(Src);
4318       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4319     }
4320   }
4321 
4322   // An induction variable will remain scalar if all users of the induction
4323   // variable and induction variable update remain scalar.
4324   for (const auto &Induction : Legal->getInductionVars()) {
4325     auto *Ind = Induction.first;
4326     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4327 
4328     // If tail-folding is applied, the primary induction variable will be used
4329     // to feed a vector compare.
4330     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4331       continue;
4332 
4333     // Returns true if \p Indvar is a pointer induction that is used directly by
4334     // load/store instruction \p I.
4335     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4336                                               Instruction *I) {
4337       return Induction.second.getKind() ==
4338                  InductionDescriptor::IK_PtrInduction &&
4339              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4340              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4341     };
4342 
4343     // Determine if all users of the induction variable are scalar after
4344     // vectorization.
4345     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4346       auto *I = cast<Instruction>(U);
4347       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4348              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4349     });
4350     if (!ScalarInd)
4351       continue;
4352 
4353     // Determine if all users of the induction variable update instruction are
4354     // scalar after vectorization.
4355     auto ScalarIndUpdate =
4356         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4357           auto *I = cast<Instruction>(U);
4358           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4359                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4360         });
4361     if (!ScalarIndUpdate)
4362       continue;
4363 
4364     // The induction variable and its update instruction will remain scalar.
4365     Worklist.insert(Ind);
4366     Worklist.insert(IndUpdate);
4367     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4368     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4369                       << "\n");
4370   }
4371 
4372   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4373 }
4374 
4375 bool LoopVectorizationCostModel::isScalarWithPredication(
4376     Instruction *I, ElementCount VF) const {
4377   if (!isPredicatedInst(I))
4378     return false;
4379 
4380   // Do we have a non-scalar lowering for this predicated
4381   // instruction? No - it is scalar with predication.
4382   switch(I->getOpcode()) {
4383   default:
4384     return true;
4385   case Instruction::Load:
4386   case Instruction::Store: {
4387     auto *Ptr = getLoadStorePointerOperand(I);
4388     auto *Ty = getLoadStoreType(I);
4389     Type *VTy = Ty;
4390     if (VF.isVector())
4391       VTy = VectorType::get(Ty, VF);
4392     const Align Alignment = getLoadStoreAlignment(I);
4393     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4394                                 TTI.isLegalMaskedGather(VTy, Alignment))
4395                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4396                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4397   }
4398   case Instruction::UDiv:
4399   case Instruction::SDiv:
4400   case Instruction::SRem:
4401   case Instruction::URem: {
4402     // We have the option to use the safe-divisor idiom to avoid predication.
4403     // The cost based decision here will always select safe-divisor for
4404     // scalable vectors as scalarization isn't legal.
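    //
    // Illustrative shorthand of the safe-divisor form (udiv, VF = 4):
    //   %safe.div = select <4 x i1> %mask, <4 x i32> %divisor,
    //                      <4 x i32> <i32 1, i32 1, i32 1, i32 1>
    //   %quot     = udiv <4 x i32> %dividend, %safe.div
    // which avoids scalarizing and predicating the division itself.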
4405     const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
4406     return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
4407   }
4408   }
4409 }
4410 
4411 bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
4412   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4413     return false;
4414 
4415   // Can we prove this instruction is safe to unconditionally execute?
4416   // If not, we must use some form of predication.
4417   switch(I->getOpcode()) {
4418   default:
4419     return false;
4420   case Instruction::Load:
4421   case Instruction::Store: {
4422     if (!Legal->isMaskRequired(I))
4423       return false;
4424     // When we know the load's address is loop invariant and the instruction
4425     // in the original scalar loop was unconditionally executed then we
4426     // don't need to mark it as a predicated instruction. Tail folding may
4427     // introduce additional predication, but we're guaranteed to always have
4428     // at least one active lane.  We call Legal->blockNeedsPredication here
4429     // because it doesn't query tail-folding.  For stores, we need to prove
4430     // both speculation safety (which follows from the same argument as loads),
4431     // but also must prove the value being stored is correct.  The easiest
    // form of the latter is to require that all values stored are the same.
    if (Legal->isUniformMemOp(*I) &&
        (isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
          TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand()))) &&
        !Legal->blockNeedsPredication(I->getParent()))
4438       return false;
4439     return true;
4440   }
4441   case Instruction::UDiv:
4442   case Instruction::SDiv:
4443   case Instruction::SRem:
4444   case Instruction::URem:
4445     // TODO: We can use the loop-preheader as context point here and get
4446     // context sensitive reasoning
4447     return !isSafeToSpeculativelyExecute(I);
4448   }
4449 }
4450 
4451 std::pair<InstructionCost, InstructionCost>
4452 LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
4453                                                     ElementCount VF) const {
4454   assert(I->getOpcode() == Instruction::UDiv ||
4455          I->getOpcode() == Instruction::SDiv ||
4456          I->getOpcode() == Instruction::SRem ||
4457          I->getOpcode() == Instruction::URem);
4458   assert(!isSafeToSpeculativelyExecute(I));
4459 
4460   const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4461 
4462   // Scalarization isn't legal for scalable vector types
4463   InstructionCost ScalarizationCost = InstructionCost::getInvalid();
4464   if (!VF.isScalable()) {
4465     // Get the scalarization cost and scale this amount by the probability of
4466     // executing the predicated block. If the instruction is not predicated,
4467     // we fall through to the next case.
4468     ScalarizationCost = 0;
4469 
4470     // These instructions have a non-void type, so account for the phi nodes
4471     // that we will create. This cost is likely to be zero. The phi node
4472     // cost, if any, should be scaled by the block probability because it
4473     // models a copy at the end of each predicated block.
4474     ScalarizationCost += VF.getKnownMinValue() *
4475       TTI.getCFInstrCost(Instruction::PHI, CostKind);
4476 
4477     // The cost of the non-predicated instruction.
4478     ScalarizationCost += VF.getKnownMinValue() *
4479       TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
4480 
4481     // The cost of insertelement and extractelement instructions needed for
4482     // scalarization.
4483     ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4484 
4485     // Scale the cost by the probability of executing the predicated blocks.
4486     // This assumes the predicated block for each vector lane is equally
4487     // likely.
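    // E.g. (illustrative) with the default reciprocal block probability of 2,
    // i.e. assuming each predicated block executes on roughly half of the
    // iterations, the accumulated scalarization cost is halved.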
4488     ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
4489   }
4490   InstructionCost SafeDivisorCost = 0;
4491 
4492   auto *VecTy = ToVectorTy(I->getType(), VF);
4493 
4494   // The cost of the select guard to ensure all lanes are well defined
4495   // after we speculate above any internal control flow.
4496   SafeDivisorCost += TTI.getCmpSelInstrCost(
4497     Instruction::Select, VecTy,
4498     ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
4499     CmpInst::BAD_ICMP_PREDICATE, CostKind);
4500 
4501   // Certain instructions can be cheaper to vectorize if they have a constant
4502   // second vector operand. One example of this are shifts on x86.
4503   Value *Op2 = I->getOperand(1);
4504   auto Op2Info = TTI.getOperandInfo(Op2);
4505   if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
4506     Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
4507 
4508   SmallVector<const Value *, 4> Operands(I->operand_values());
4509   SafeDivisorCost += TTI.getArithmeticInstrCost(
4510     I->getOpcode(), VecTy, CostKind,
4511     {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
4512     Op2Info, Operands, I);
4513   return {ScalarizationCost, SafeDivisorCost};
4514 }
4515 
4516 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4517     Instruction *I, ElementCount VF) {
4518   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4519   assert(getWideningDecision(I, VF) == CM_Unknown &&
4520          "Decision should not be set yet.");
4521   auto *Group = getInterleavedAccessGroup(I);
4522   assert(Group && "Must have a group.");
4523 
  // If the instruction's allocated size doesn't equal its type size, it
4525   // requires padding and will be scalarized.
4526   auto &DL = I->getModule()->getDataLayout();
4527   auto *ScalarTy = getLoadStoreType(I);
4528   if (hasIrregularType(ScalarTy, DL))
4529     return false;
4530 
4531   // If the group involves a non-integral pointer, we may not be able to
4532   // losslessly cast all values to a common type.
4533   unsigned InterleaveFactor = Group->getFactor();
4534   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4535   for (unsigned i = 0; i < InterleaveFactor; i++) {
4536     Instruction *Member = Group->getMember(i);
4537     if (!Member)
4538       continue;
4539     auto *MemberTy = getLoadStoreType(Member);
4540     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4541     // Don't coerce non-integral pointers to integers or vice versa.
4542     if (MemberNI != ScalarNI) {
4543       // TODO: Consider adding special nullptr value case here
4544       return false;
4545     } else if (MemberNI && ScalarNI &&
4546                ScalarTy->getPointerAddressSpace() !=
4547                MemberTy->getPointerAddressSpace()) {
4548       return false;
4549     }
4550   }
4551 
4552   // Check if masking is required.
4553   // A Group may need masking for one of two reasons: it resides in a block that
4554   // needs predication, or it was decided to use masking to deal with gaps
4555   // (either a gap at the end of a load-access that may result in a speculative
4556   // load, or any gaps in a store-access).
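  //
  // E.g. (illustrative) a factor-2 store group that only writes A[2*i] leaves
  // a gap at A[2*i+1], so the wide store must be masked to avoid clobbering
  // the skipped elements; similarly, a load group with a gap at its end may
  // read past the last element unless the epilogue iteration is kept scalar or
  // the access is masked.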
4557   bool PredicatedAccessRequiresMasking =
4558       blockNeedsPredicationForAnyReason(I->getParent()) &&
4559       Legal->isMaskRequired(I);
4560   bool LoadAccessWithGapsRequiresEpilogMasking =
4561       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4562       !isScalarEpilogueAllowed();
4563   bool StoreAccessWithGapsRequiresMasking =
4564       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4565   if (!PredicatedAccessRequiresMasking &&
4566       !LoadAccessWithGapsRequiresEpilogMasking &&
4567       !StoreAccessWithGapsRequiresMasking)
4568     return true;
4569 
4570   // If masked interleaving is required, we expect that the user/target had
4571   // enabled it, because otherwise it either wouldn't have been created or
4572   // it should have been invalidated by the CostModel.
4573   assert(useMaskedInterleavedAccesses(TTI) &&
4574          "Masked interleave-groups for predicated accesses are not enabled.");
4575 
4576   if (Group->isReverse())
4577     return false;
4578 
4579   auto *Ty = getLoadStoreType(I);
4580   const Align Alignment = getLoadStoreAlignment(I);
4581   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4582                           : TTI.isLegalMaskedStore(Ty, Alignment);
4583 }
4584 
4585 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4586     Instruction *I, ElementCount VF) {
4587   // Get and ensure we have a valid memory instruction.
4588   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4589 
4590   auto *Ptr = getLoadStorePointerOperand(I);
4591   auto *ScalarTy = getLoadStoreType(I);
4592 
4593   // In order to be widened, the pointer should be consecutive, first of all.
4594   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4595     return false;
4596 
4597   // If the instruction is a store located in a predicated block, it will be
4598   // scalarized.
4599   if (isScalarWithPredication(I, VF))
4600     return false;
4601 
  // If the instruction's allocated size doesn't equal its type size, it
4603   // requires padding and will be scalarized.
4604   auto &DL = I->getModule()->getDataLayout();
4605   if (hasIrregularType(ScalarTy, DL))
4606     return false;
4607 
4608   return true;
4609 }
4610 
4611 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4612   // We should not collect Uniforms more than once per VF. Right now,
4613   // this function is called from collectUniformsAndScalars(), which
4614   // already does this check. Collecting Uniforms for VF=1 does not make any
4615   // sense.
4616 
4617   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4618          "This function should not be visited twice for the same VF");
4619 
  // Create the entry for this VF up front. Even if no uniform value is found,
  // Uniforms.count(VF) will return 1 and we will not analyze this VF again.
4622   Uniforms[VF].clear();
4623 
4624   // We now know that the loop is vectorizable!
4625   // Collect instructions inside the loop that will remain uniform after
4626   // vectorization.
4627 
4628   // Global values, params and instructions outside of current loop are out of
4629   // scope.
4630   auto isOutOfScope = [&](Value *V) -> bool {
4631     Instruction *I = dyn_cast<Instruction>(V);
4632     return (!I || !TheLoop->contains(I));
4633   };
4634 
4635   // Worklist containing uniform instructions demanding lane 0.
4636   SetVector<Instruction *> Worklist;
4637   BasicBlock *Latch = TheLoop->getLoopLatch();
4638 
4639   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4640   // that are scalar with predication must not be considered uniform after
4641   // vectorization, because that would create an erroneous replicating region
4642   // where only a single instance out of VF should be formed.
  // TODO: optimize such rare cases if found important, see PR40816.
4644   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4645     if (isOutOfScope(I)) {
4646       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4647                         << *I << "\n");
4648       return;
4649     }
4650     if (isScalarWithPredication(I, VF)) {
4651       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4652                         << *I << "\n");
4653       return;
4654     }
4655     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4656     Worklist.insert(I);
4657   };
4658 
4659   // Start with the conditional branch. If the branch condition is an
4660   // instruction contained in the loop that is only used by the branch, it is
4661   // uniform.
4662   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4663   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4664     addToWorklistIfAllowed(Cmp);
4665 
4666   // Return true if all lanes perform the same memory operation, and we can
  // thus choose to execute only one.
4668   auto isUniformMemOpUse = [&](Instruction *I) {
4669     if (!Legal->isUniformMemOp(*I))
4670       return false;
4671     if (isa<LoadInst>(I))
4672       // Loading the same address always produces the same result - at least
4673       // assuming aliasing and ordering which have already been checked.
4674       return true;
4675     // Storing the same value on every iteration.
4676     return TheLoop->isLoopInvariant(cast<StoreInst>(I)->getValueOperand());
4677   };
4678 
4679   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4680     InstWidening WideningDecision = getWideningDecision(I, VF);
4681     assert(WideningDecision != CM_Unknown &&
4682            "Widening decision should be ready at this moment");
4683 
4684     if (isUniformMemOpUse(I))
4685       return true;
4686 
4687     return (WideningDecision == CM_Widen ||
4688             WideningDecision == CM_Widen_Reverse ||
4689             WideningDecision == CM_Interleave);
4690   };
4691 
4692   // Returns true if Ptr is the pointer operand of a memory access instruction
4693   // I, I is known to not require scalarization, and the pointer is not also
4694   // stored.
4695   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4696     auto GetStoredValue = [I]() -> Value * {
4697       if (!isa<StoreInst>(I))
4698         return nullptr;
4699       return I->getOperand(0);
4700     };
4701     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF) &&
4702            GetStoredValue() != Ptr;
4703   };
4704 
4705   // Holds a list of values which are known to have at least one uniform use.
4706   // Note that there may be other uses which aren't uniform.  A "uniform use"
4707   // here is something which only demands lane 0 of the unrolled iterations;
4708   // it does not imply that all lanes produce the same value (e.g. this is not
4709   // the usual meaning of uniform)
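  //
  // E.g. (illustrative) the pointer operand of a consecutive widened load only
  // demands lane 0 (the base address of the wide load), even though other,
  // non-uniform uses of the same pointer may exist.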
4710   SetVector<Value *> HasUniformUse;
4711 
4712   // Scan the loop for instructions which are either a) known to have only
4713   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4714   for (auto *BB : TheLoop->blocks())
4715     for (auto &I : *BB) {
4716       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4717         switch (II->getIntrinsicID()) {
4718         case Intrinsic::sideeffect:
4719         case Intrinsic::experimental_noalias_scope_decl:
4720         case Intrinsic::assume:
4721         case Intrinsic::lifetime_start:
4722         case Intrinsic::lifetime_end:
4723           if (TheLoop->hasLoopInvariantOperands(&I))
4724             addToWorklistIfAllowed(&I);
4725           break;
4726         default:
4727           break;
4728         }
4729       }
4730 
4731       // ExtractValue instructions must be uniform, because the operands are
4732       // known to be loop-invariant.
4733       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4734         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4735                "Expected aggregate value to be loop invariant");
4736         addToWorklistIfAllowed(EVI);
4737         continue;
4738       }
4739 
4740       // If there's no pointer operand, there's nothing to do.
4741       auto *Ptr = getLoadStorePointerOperand(&I);
4742       if (!Ptr)
4743         continue;
4744 
4745       if (isUniformMemOpUse(&I))
4746         addToWorklistIfAllowed(&I);
4747 
4748       if (isVectorizedMemAccessUse(&I, Ptr)) {
4749         assert(isUniformDecision(&I, VF) && "consistency check");
4750         HasUniformUse.insert(Ptr);
4751       }
4752     }
4753 
4754   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4755   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4756   // disallows uses outside the loop as well.
4757   for (auto *V : HasUniformUse) {
4758     if (isOutOfScope(V))
4759       continue;
4760     auto *I = cast<Instruction>(V);
4761     auto UsersAreMemAccesses =
4762       llvm::all_of(I->users(), [&](User *U) -> bool {
4763         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4764       });
4765     if (UsersAreMemAccesses)
4766       addToWorklistIfAllowed(I);
4767   }
4768 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist.  This ensures
  // a uniform instruction will only be used by uniform instructions.
4772   unsigned idx = 0;
4773   while (idx != Worklist.size()) {
4774     Instruction *I = Worklist[idx++];
4775 
4776     for (auto *OV : I->operand_values()) {
4777       // isOutOfScope operands cannot be uniform instructions.
4778       if (isOutOfScope(OV))
4779         continue;
4780       // First order recurrence Phi's should typically be considered
4781       // non-uniform.
4782       auto *OP = dyn_cast<PHINode>(OV);
4783       if (OP && Legal->isFixedOrderRecurrence(OP))
4784         continue;
4785       // If all the users of the operand are uniform, then add the
4786       // operand into the uniform worklist.
4787       auto *OI = cast<Instruction>(OV);
4788       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4789             auto *J = cast<Instruction>(U);
4790             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4791           }))
4792         addToWorklistIfAllowed(OI);
4793     }
4794   }
4795 
4796   // For an instruction to be added into Worklist above, all its users inside
4797   // the loop should also be in Worklist. However, this condition cannot be
4798   // true for phi nodes that form a cyclic dependence. We must process phi
4799   // nodes separately. An induction variable will remain uniform if all users
4800   // of the induction variable and induction variable update remain uniform.
4801   // The code below handles both pointer and non-pointer induction variables.
4802   for (const auto &Induction : Legal->getInductionVars()) {
4803     auto *Ind = Induction.first;
4804     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4805 
4806     // Determine if all users of the induction variable are uniform after
4807     // vectorization.
4808     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4809       auto *I = cast<Instruction>(U);
4810       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4811              isVectorizedMemAccessUse(I, Ind);
4812     });
4813     if (!UniformInd)
4814       continue;
4815 
4816     // Determine if all users of the induction variable update instruction are
4817     // uniform after vectorization.
4818     auto UniformIndUpdate =
4819         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4820           auto *I = cast<Instruction>(U);
4821           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4822                  isVectorizedMemAccessUse(I, IndUpdate);
4823         });
4824     if (!UniformIndUpdate)
4825       continue;
4826 
4827     // The induction variable and its update instruction will remain uniform.
4828     addToWorklistIfAllowed(Ind);
4829     addToWorklistIfAllowed(IndUpdate);
4830   }
4831 
4832   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4833 }
4834 
4835 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4836   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4837 
4838   if (Legal->getRuntimePointerChecking()->Need) {
4839     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4840         "runtime pointer checks needed. Enable vectorization of this "
4841         "loop with '#pragma clang loop vectorize(enable)' when "
4842         "compiling with -Os/-Oz",
4843         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4844     return true;
4845   }
4846 
4847   if (!PSE.getPredicate().isAlwaysTrue()) {
4848     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4849         "runtime SCEV checks needed. Enable vectorization of this "
4850         "loop with '#pragma clang loop vectorize(enable)' when "
4851         "compiling with -Os/-Oz",
4852         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4853     return true;
4854   }
4855 
4856   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4857   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4858     reportVectorizationFailure("Runtime stride check for small trip count",
4859         "runtime stride == 1 checks needed. Enable vectorization of "
4860         "this loop without such check by compiling with -Os/-Oz",
4861         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4862     return true;
4863   }
4864 
4865   return false;
4866 }
4867 
4868 ElementCount
4869 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4870   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4871     return ElementCount::getScalable(0);
4872 
4873   if (Hints->isScalableVectorizationDisabled()) {
4874     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4875                             "ScalableVectorizationDisabled", ORE, TheLoop);
4876     return ElementCount::getScalable(0);
4877   }
4878 
4879   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4880 
4881   auto MaxScalableVF = ElementCount::getScalable(
4882       std::numeric_limits<ElementCount::ScalarTy>::max());
4883 
4884   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4885   // FIXME: While for scalable vectors this is currently sufficient, this should
4886   // be replaced by a more detailed mechanism that filters out specific VFs,
4887   // instead of invalidating vectorization for a whole set of VFs based on the
4888   // MaxVF.
4889 
4890   // Disable scalable vectorization if the loop contains unsupported reductions.
4891   if (!canVectorizeReductions(MaxScalableVF)) {
4892     reportVectorizationInfo(
4893         "Scalable vectorization not supported for the reduction "
4894         "operations found in this loop.",
4895         "ScalableVFUnfeasible", ORE, TheLoop);
4896     return ElementCount::getScalable(0);
4897   }
4898 
4899   // Disable scalable vectorization if the loop contains any instructions
4900   // with element types not supported for scalable vectors.
4901   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4902         return !Ty->isVoidTy() &&
4903                !this->TTI.isElementTypeLegalForScalableVector(Ty);
4904       })) {
4905     reportVectorizationInfo("Scalable vectorization is not supported "
4906                             "for all element types found in this loop.",
4907                             "ScalableVFUnfeasible", ORE, TheLoop);
4908     return ElementCount::getScalable(0);
4909   }
4910 
4911   if (Legal->isSafeForAnyVectorWidth())
4912     return MaxScalableVF;
4913 
4914   // Limit MaxScalableVF by the maximum safe dependence distance.
4915   std::optional<unsigned> MaxVScale = TTI.getMaxVScale();
4916   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
4917     MaxVScale =
4918         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
4919   MaxScalableVF =
4920       ElementCount::getScalable(MaxVScale ? (MaxSafeElements / *MaxVScale) : 0);
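  // For example (hypothetical numbers): with MaxSafeElements = 32 and a
  // target/function maximum vscale of 16, the largest safe scalable VF is
  // vscale x (32 / 16) = vscale x 2; if no maximum vscale is known, scalable
  // vectorization is given up on here (VF = vscale x 0).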
4921   if (!MaxScalableVF)
4922     reportVectorizationInfo(
4923         "Max legal vector width too small, scalable vectorization "
4924         "unfeasible.",
4925         "ScalableVFUnfeasible", ORE, TheLoop);
4926 
4927   return MaxScalableVF;
4928 }
4929 
4930 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4931     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4932   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4933   unsigned SmallestType, WidestType;
4934   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4935 
4936   // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
4939   // dependence distance).
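  // For example (hypothetical numbers): if LAA reports a maximum safe vector
  // width of 256 bits and the widest type in the loop is 32 bits wide, then
  // MaxSafeElements = PowerOf2Floor(256 / 32) = 8; with a 96-bit bound it
  // would be PowerOf2Floor(96 / 32) = 2.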
4940   unsigned MaxSafeElements =
4941       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
4942 
4943   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4944   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4945 
4946   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4947                     << ".\n");
4948   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4949                     << ".\n");
4950 
4951   // First analyze the UserVF, fall back if the UserVF should be ignored.
4952   if (UserVF) {
4953     auto MaxSafeUserVF =
4954         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4955 
4956     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4957       // If `VF=vscale x N` is safe, then so is `VF=N`
4958       if (UserVF.isScalable())
4959         return FixedScalableVFPair(
4960             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4961       else
4962         return UserVF;
4963     }
4964 
4965     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4966 
4967     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4968     // is better to ignore the hint and let the compiler choose a suitable VF.
4969     if (!UserVF.isScalable()) {
4970       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4971                         << " is unsafe, clamping to max safe VF="
4972                         << MaxSafeFixedVF << ".\n");
4973       ORE->emit([&]() {
4974         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4975                                           TheLoop->getStartLoc(),
4976                                           TheLoop->getHeader())
4977                << "User-specified vectorization factor "
4978                << ore::NV("UserVectorizationFactor", UserVF)
4979                << " is unsafe, clamping to maximum safe vectorization factor "
4980                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4981       });
4982       return MaxSafeFixedVF;
4983     }
4984 
4985     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4986       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4987                         << " is ignored because scalable vectors are not "
4988                            "available.\n");
4989       ORE->emit([&]() {
4990         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4991                                           TheLoop->getStartLoc(),
4992                                           TheLoop->getHeader())
4993                << "User-specified vectorization factor "
4994                << ore::NV("UserVectorizationFactor", UserVF)
4995                << " is ignored because the target does not support scalable "
4996                   "vectors. The compiler will pick a more suitable value.";
4997       });
4998     } else {
4999       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5000                         << " is unsafe. Ignoring scalable UserVF.\n");
5001       ORE->emit([&]() {
5002         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5003                                           TheLoop->getStartLoc(),
5004                                           TheLoop->getHeader())
5005                << "User-specified vectorization factor "
5006                << ore::NV("UserVectorizationFactor", UserVF)
5007                << " is unsafe. Ignoring the hint to let the compiler pick a "
5008                   "more suitable value.";
5009       });
5010     }
5011   }
5012 
5013   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5014                     << " / " << WidestType << " bits.\n");
5015 
5016   FixedScalableVFPair Result(ElementCount::getFixed(1),
5017                              ElementCount::getScalable(0));
5018   if (auto MaxVF =
5019           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5020                                   MaxSafeFixedVF, FoldTailByMasking))
5021     Result.FixedVF = MaxVF;
5022 
5023   if (auto MaxVF =
5024           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5025                                   MaxSafeScalableVF, FoldTailByMasking))
5026     if (MaxVF.isScalable()) {
5027       Result.ScalableVF = MaxVF;
5028       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5029                         << "\n");
5030     }
5031 
5032   return Result;
5033 }
5034 
5035 FixedScalableVFPair
5036 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5037   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to
    // be dynamically uniform if the target can skip it.
5040     reportVectorizationFailure(
5041         "Not inserting runtime ptr check for divergent target",
5042         "runtime pointer checks needed. Not enabled for divergent target",
5043         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5044     return FixedScalableVFPair::getNone();
5045   }
5046 
5047   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5048   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5049   if (TC == 1) {
5050     reportVectorizationFailure("Single iteration (non) loop",
5051         "loop trip count is one, irrelevant for vectorization",
5052         "SingleIterationLoop", ORE, TheLoop);
5053     return FixedScalableVFPair::getNone();
5054   }
5055 
5056   switch (ScalarEpilogueStatus) {
5057   case CM_ScalarEpilogueAllowed:
5058     return computeFeasibleMaxVF(TC, UserVF, false);
5059   case CM_ScalarEpilogueNotAllowedUsePredicate:
5060     [[fallthrough]];
5061   case CM_ScalarEpilogueNotNeededUsePredicate:
5062     LLVM_DEBUG(
5063         dbgs() << "LV: vector predicate hint/switch found.\n"
5064                << "LV: Not allowing scalar epilogue, creating predicated "
5065                << "vector loop.\n");
5066     break;
5067   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5068     // fallthrough as a special case of OptForSize
5069   case CM_ScalarEpilogueNotAllowedOptSize:
5070     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5071       LLVM_DEBUG(
5072           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5073     else
5074       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5075                         << "count.\n");
5076 
    // Bail if runtime checks are required, which are not good when optimizing
5078     // for size.
5079     if (runtimeChecksRequired())
5080       return FixedScalableVFPair::getNone();
5081 
5082     break;
5083   }
5084 
  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. Otherwise we'd have to handle
  // the fact that not every instruction executes on the last iteration. This
  // will require a lane mask which varies through the vector loop body. (TODO)
5089   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5090     // If there was a tail-folding hint/switch, but we can't fold the tail by
5091     // masking, fallback to a vectorization with a scalar epilogue.
5092     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5093       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5094                            "scalar epilogue instead.\n");
5095       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5096       return computeFeasibleMaxVF(TC, UserVF, false);
5097     }
5098     return FixedScalableVFPair::getNone();
5099   }
5100 
  // Now try tail folding.
5102 
5103   // Invalidate interleave groups that require an epilogue if we can't mask
5104   // the interleave-group.
5105   if (!useMaskedInterleavedAccesses(TTI)) {
5106     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5107            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
5110     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5111   }
5112 
5113   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5114   // Avoid tail folding if the trip count is known to be a multiple of any VF
5115   // we chose.
  // FIXME: The condition below pessimizes the case for fixed-width vectors,
5117   // when scalable VFs are also candidates for vectorization.
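  // For example (hypothetical numbers): with a trip count known to be 64,
  // MaxFixedVF = 8 and UserIC = 2, 64 % (8 * 2) == 0, so no tail remains and
  // tail folding is not needed for any fixed VF that may be chosen.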
5118   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5119     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5120     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5121            "MaxFixedVF must be a power of 2");
5122     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5123                                    : MaxFixedVF.getFixedValue();
5124     ScalarEvolution *SE = PSE.getSE();
5125     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5126     const SCEV *ExitCount = SE->getAddExpr(
5127         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5128     const SCEV *Rem = SE->getURemExpr(
5129         SE->applyLoopGuards(ExitCount, TheLoop),
5130         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5131     if (Rem->isZero()) {
5132       // Accept MaxFixedVF if we do not have a tail.
5133       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5134       return MaxFactors;
5135     }
5136   }
5137 
5138   // If we don't know the precise trip count, or if the trip count that we
5139   // found modulo the vectorization factor is not zero, try to fold the tail
5140   // by masking.
5141   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5142   if (Legal->prepareToFoldTailByMasking()) {
5143     FoldTailByMasking = true;
5144     return MaxFactors;
5145   }
5146 
5147   // If there was a tail-folding hint/switch, but we can't fold the tail by
5148   // masking, fallback to a vectorization with a scalar epilogue.
5149   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5150     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5151                          "scalar epilogue instead.\n");
5152     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5153     return MaxFactors;
5154   }
5155 
5156   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5157     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5158     return FixedScalableVFPair::getNone();
5159   }
5160 
5161   if (TC == 0) {
5162     reportVectorizationFailure(
5163         "Unable to calculate the loop count due to complex control flow",
5164         "unable to calculate the loop count due to complex control flow",
5165         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5166     return FixedScalableVFPair::getNone();
5167   }
5168 
5169   reportVectorizationFailure(
5170       "Cannot optimize for size and vectorize at the same time.",
5171       "cannot optimize for size and vectorize at the same time. "
5172       "Enable vectorization of this loop with '#pragma clang loop "
5173       "vectorize(enable)' when compiling with -Os/-Oz",
5174       "NoTailLoopWithOptForSize", ORE, TheLoop);
5175   return FixedScalableVFPair::getNone();
5176 }
5177 
5178 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5179     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5180     ElementCount MaxSafeVF, bool FoldTailByMasking) {
5181   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5182   const TypeSize WidestRegister = TTI.getRegisterBitWidth(
5183       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5184                            : TargetTransformInfo::RGK_FixedWidthVector);
5185 
5186   // Convenience function to return the minimum of two ElementCounts.
5187   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5188     assert((LHS.isScalable() == RHS.isScalable()) &&
5189            "Scalable flags must match");
5190     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5191   };
5192 
5193   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
5195   auto MaxVectorElementCount = ElementCount::get(
5196       PowerOf2Floor(WidestRegister.getKnownMinValue() / WidestType),
5197       ComputeScalableMaxVF);
5198   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5199   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5200                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5201 
5202   if (!MaxVectorElementCount) {
5203     LLVM_DEBUG(dbgs() << "LV: The target has no "
5204                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5205                       << " vector registers.\n");
5206     return ElementCount::getFixed(1);
5207   }
5208 
5209   unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
5210   if (MaxVectorElementCount.isScalable() &&
5211       TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5212     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5213     auto Min = Attr.getVScaleRangeMin();
5214     WidestRegisterMinEC *= Min;
5215   }
5216   if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
5217       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5218     // If loop trip count (TC) is known at compile time there is no point in
5219     // choosing VF greater than TC (as done in the loop below). Select maximum
5220     // power of two which doesn't exceed TC.
5221     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5222     // when the TC is less than or equal to the known number of lanes.
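    // For example (hypothetical numbers, tail not folded by masking): with a
    // constant trip count of 6 and a widest-register minimum of 8 lanes, the
    // VF is clamped to PowerOf2Floor(6) = 4.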
5223     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5224     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5225                          "exceeding the constant trip count: "
5226                       << ClampedConstTripCount << "\n");
5227     return ElementCount::getFixed(ClampedConstTripCount);
5228   }
5229 
5230   TargetTransformInfo::RegisterKind RegKind =
5231       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5232                            : TargetTransformInfo::RGK_FixedWidthVector;
5233   ElementCount MaxVF = MaxVectorElementCount;
5234   if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
5235                             TTI.shouldMaximizeVectorBandwidth(RegKind))) {
5236     auto MaxVectorElementCountMaxBW = ElementCount::get(
5237         PowerOf2Floor(WidestRegister.getKnownMinValue() / SmallestType),
5238         ComputeScalableMaxVF);
5239     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5240 
5241     // Collect all viable vectorization factors larger than the default MaxVF
5242     // (i.e. MaxVectorElementCount).
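    // For example (hypothetical numbers): if MaxVectorElementCount is 4 and
    // MaxVectorElementCountMaxBW is 16, the candidate VFs considered here are
    // 8 and 16.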
5243     SmallVector<ElementCount, 8> VFs;
5244     for (ElementCount VS = MaxVectorElementCount * 2;
5245          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5246       VFs.push_back(VS);
5247 
5248     // For each VF calculate its register usage.
5249     auto RUs = calculateRegisterUsage(VFs);
5250 
5251     // Select the largest VF which doesn't require more registers than existing
5252     // ones.
5253     for (int i = RUs.size() - 1; i >= 0; --i) {
5254       bool Selected = true;
5255       for (auto &pair : RUs[i].MaxLocalUsers) {
5256         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5257         if (pair.second > TargetNumRegisters)
5258           Selected = false;
5259       }
5260       if (Selected) {
5261         MaxVF = VFs[i];
5262         break;
5263       }
5264     }
5265     if (ElementCount MinVF =
5266             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5267       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5268         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5269                           << ") with target's minimum: " << MinVF << '\n');
5270         MaxVF = MinVF;
5271       }
5272     }
5273 
5274     // Invalidate any widening decisions we might have made, in case the loop
5275     // requires prediction (decided later), but we have already made some
5276     // load/store widening decisions.
5277     invalidateCostModelingDecisions();
5278   }
5279   return MaxVF;
5280 }
5281 
5282 std::optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5283   if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5284     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5285     auto Min = Attr.getVScaleRangeMin();
5286     auto Max = Attr.getVScaleRangeMax();
5287     if (Max && Min == Max)
5288       return Max;
5289   }
5290 
5291   return TTI.getVScaleForTuning();
5292 }
5293 
5294 bool LoopVectorizationCostModel::isMoreProfitable(
5295     const VectorizationFactor &A, const VectorizationFactor &B) const {
5296   InstructionCost CostA = A.Cost;
5297   InstructionCost CostB = B.Cost;
5298 
5299   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5300 
5301   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5302       MaxTripCount) {
5303     // If we are folding the tail and the trip count is a known (possibly small)
5304     // constant, the trip count will be rounded up to an integer number of
5305     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5306     // which we compare directly. When not folding the tail, the total cost will
5307     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5308     // approximated with the per-lane cost below instead of using the tripcount
5309     // as here.
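    // For example (hypothetical costs): with MaxTripCount = 10, a candidate
    // {VF=4, Cost=12} yields 12 * ceil(10/4) = 36, while {VF=8, Cost=20}
    // yields 20 * ceil(10/8) = 40, so VF=4 wins here even though its per-lane
    // cost (3) is higher than that of VF=8 (2.5).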
5310     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5311     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5312     return RTCostA < RTCostB;
5313   }
5314 
5315   // Improve estimate for the vector width if it is scalable.
5316   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5317   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5318   if (std::optional<unsigned> VScale = getVScaleForTuning()) {
5319     if (A.Width.isScalable())
5320       EstimatedWidthA *= *VScale;
5321     if (B.Width.isScalable())
5322       EstimatedWidthB *= *VScale;
5323   }
5324 
5325   // Assume vscale may be larger than 1 (or the value being tuned for),
5326   // so that scalable vectorization is slightly favorable over fixed-width
5327   // vectorization.
5328   if (A.Width.isScalable() && !B.Width.isScalable())
5329     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5330 
5331   // To avoid the need for FP division:
5332   //      (CostA / A.Width) < (CostB / B.Width)
5333   // <=>  (CostA * B.Width) < (CostB * A.Width)
5334   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5335 }
5336 
5337 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5338     const ElementCountSet &VFCandidates) {
5339   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5340   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5341   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5342   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5343          "Expected Scalar VF to be a candidate");
5344 
5345   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5346                                        ExpectedCost);
5347   VectorizationFactor ChosenFactor = ScalarCost;
5348 
5349   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5350   if (ForceVectorization && VFCandidates.size() > 1) {
5351     // Ignore scalar width, because the user explicitly wants vectorization.
5352     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5353     // evaluation.
5354     ChosenFactor.Cost = InstructionCost::getMax();
5355   }
5356 
5357   SmallVector<InstructionVFPair> InvalidCosts;
5358   for (const auto &i : VFCandidates) {
5359     // The cost for scalar VF=1 is already calculated, so ignore it.
5360     if (i.isScalar())
5361       continue;
5362 
5363     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5364     VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5365 
5366 #ifndef NDEBUG
5367     unsigned AssumedMinimumVscale = 1;
5368     if (std::optional<unsigned> VScale = getVScaleForTuning())
5369       AssumedMinimumVscale = *VScale;
5370     unsigned Width =
5371         Candidate.Width.isScalable()
5372             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5373             : Candidate.Width.getFixedValue();
5374     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5375                       << " costs: " << (Candidate.Cost / Width));
5376     if (i.isScalable())
5377       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5378                         << AssumedMinimumVscale << ")");
5379     LLVM_DEBUG(dbgs() << ".\n");
5380 #endif
5381 
5382     if (!C.second && !ForceVectorization) {
5383       LLVM_DEBUG(
5384           dbgs() << "LV: Not considering vector loop of width " << i
5385                  << " because it will not generate any vector instructions.\n");
5386       continue;
5387     }
5388 
    // If profitable, add it to the ProfitableVFs list.
5390     if (isMoreProfitable(Candidate, ScalarCost))
5391       ProfitableVFs.push_back(Candidate);
5392 
5393     if (isMoreProfitable(Candidate, ChosenFactor))
5394       ChosenFactor = Candidate;
5395   }
5396 
5397   // Emit a report of VFs with invalid costs in the loop.
5398   if (!InvalidCosts.empty()) {
5399     // Group the remarks per instruction, keeping the instruction order from
5400     // InvalidCosts.
5401     std::map<Instruction *, unsigned> Numbering;
5402     unsigned I = 0;
5403     for (auto &Pair : InvalidCosts)
5404       if (!Numbering.count(Pair.first))
5405         Numbering[Pair.first] = I++;
5406 
5407     // Sort the list, first on instruction(number) then on VF.
5408     llvm::sort(InvalidCosts,
5409                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5410                  if (Numbering[A.first] != Numbering[B.first])
5411                    return Numbering[A.first] < Numbering[B.first];
5412                  ElementCountComparator ECC;
5413                  return ECC(A.second, B.second);
5414                });
5415 
5416     // For a list of ordered instruction-vf pairs:
5417     //   [(load, vf1), (load, vf2), (store, vf1)]
5418     // Group the instructions together to emit separate remarks for:
5419     //   load  (vf1, vf2)
5420     //   store (vf1)
5421     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5422     auto Subset = ArrayRef<InstructionVFPair>();
5423     do {
5424       if (Subset.empty())
5425         Subset = Tail.take_front(1);
5426 
5427       Instruction *I = Subset.front().first;
5428 
5429       // If the next instruction is different, or if there are no other pairs,
5430       // emit a remark for the collated subset. e.g.
      //   [(load, vf1), (load, vf2)]
      // to emit:
      //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5434       if (Subset == Tail || Tail[Subset.size()].first != I) {
5435         std::string OutString;
5436         raw_string_ostream OS(OutString);
5437         assert(!Subset.empty() && "Unexpected empty range");
5438         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5439         for (const auto &Pair : Subset)
5440           OS << (Pair.second == Subset.front().second ? "" : ", ")
5441              << Pair.second;
5442         OS << "):";
5443         if (auto *CI = dyn_cast<CallInst>(I))
5444           OS << " call to " << CI->getCalledFunction()->getName();
5445         else
5446           OS << " " << I->getOpcodeName();
5447         OS.flush();
5448         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5449         Tail = Tail.drop_front(Subset.size());
5450         Subset = {};
5451       } else
5452         // Grow the subset by one element
5453         Subset = Tail.take_front(Subset.size() + 1);
5454     } while (!Tail.empty());
5455   }
5456 
5457   if (!EnableCondStoresVectorization && NumPredStores) {
5458     reportVectorizationFailure("There are conditional stores.",
5459         "store that is conditionally executed prevents vectorization",
5460         "ConditionalStore", ORE, TheLoop);
5461     ChosenFactor = ScalarCost;
5462   }
5463 
5464   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5465                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5466              << "LV: Vectorization seems to be not beneficial, "
5467              << "but was forced by a user.\n");
5468   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5469   return ChosenFactor;
5470 }
5471 
5472 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5473     const Loop &L, ElementCount VF) const {
5474   // Cross iteration phis such as reductions need special handling and are
5475   // currently unsupported.
5476   if (any_of(L.getHeader()->phis(),
5477              [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
5478     return false;
5479 
5480   // Phis with uses outside of the loop require special handling and are
5481   // currently unsupported.
5482   for (const auto &Entry : Legal->getInductionVars()) {
5483     // Look for uses of the value of the induction at the last iteration.
5484     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5485     for (User *U : PostInc->users())
5486       if (!L.contains(cast<Instruction>(U)))
5487         return false;
    // Look for uses of the penultimate value of the induction.
5489     for (User *U : Entry.first->users())
5490       if (!L.contains(cast<Instruction>(U)))
5491         return false;
5492   }
5493 
  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
5497   if (L.getExitingBlock() != L.getLoopLatch())
5498     return false;
5499 
5500   return true;
5501 }
5502 
5503 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5504     const ElementCount VF) const {
5505   // FIXME: We need a much better cost-model to take different parameters such
5506   // as register pressure, code size increase and cost of extra branches into
5507   // account. For now we apply a very crude heuristic and only consider loops
5508   // with vectorization factors larger than a certain value.
5509 
5510   // Allow the target to opt out entirely.
5511   if (!TTI.preferEpilogueVectorization())
5512     return false;
5513 
5514   // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
5516   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5517     return false;
5518   // FIXME: We should consider changing the threshold for scalable
5519   // vectors to take VScaleForTuning into account.
5520   if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5521     return true;
5522   return false;
5523 }
5524 
5525 VectorizationFactor
5526 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5527     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5528   VectorizationFactor Result = VectorizationFactor::Disabled();
5529   if (!EnableEpilogueVectorization) {
5530     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5531     return Result;
5532   }
5533 
5534   if (!isScalarEpilogueAllowed()) {
5535     LLVM_DEBUG(
5536         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5537                   "allowed.\n";);
5538     return Result;
5539   }
5540 
5541   // Not really a cost consideration, but check for unsupported cases here to
5542   // simplify the logic.
5543   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5544     LLVM_DEBUG(
5545         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5546                   "not a supported candidate.\n";);
5547     return Result;
5548   }
5549 
5550   if (EpilogueVectorizationForceVF > 1) {
5551     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    ElementCount ForcedEC =
        ElementCount::getFixed(EpilogueVectorizationForceVF);
5553     if (LVP.hasPlanWithVF(ForcedEC))
5554       return {ForcedEC, 0, 0};
5555     else {
5556       LLVM_DEBUG(
5557           dbgs()
5558               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5559       return Result;
5560     }
5561   }
5562 
5563   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5564       TheLoop->getHeader()->getParent()->hasMinSize()) {
5565     LLVM_DEBUG(
5566         dbgs()
5567             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5568     return Result;
5569   }
5570 
5571   if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5572     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5573                          "this loop\n");
5574     return Result;
5575   }
5576 
5577   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5578   // the main loop handles 8 lanes per iteration. We could still benefit from
5579   // vectorizing the epilogue loop with VF=4.
5580   ElementCount EstimatedRuntimeVF = MainLoopVF;
5581   if (MainLoopVF.isScalable()) {
5582     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5583     if (std::optional<unsigned> VScale = getVScaleForTuning())
5584       EstimatedRuntimeVF *= *VScale;
5585   }
5586 
5587   for (auto &NextVF : ProfitableVFs)
5588     if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5589           ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
5590          ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
5591         (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
5592         LVP.hasPlanWithVF(NextVF.Width))
5593       Result = NextVF;
5594 
5595   if (Result != VectorizationFactor::Disabled())
5596     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5597                       << Result.Width << "\n";);
5598   return Result;
5599 }
5600 
5601 std::pair<unsigned, unsigned>
5602 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5603   unsigned MinWidth = -1U;
5604   unsigned MaxWidth = 8;
5605   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5606   // For in-loop reductions, no element types are added to ElementTypesInLoop
5607   // if there are no loads/stores in the loop. In this case, check through the
5608   // reduction variables to determine the maximum width.
5609   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5610     // Reset MaxWidth so that we can find the smallest type used by recurrences
5611     // in the loop.
5612     MaxWidth = -1U;
5613     for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5614       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5615       // When finding the min width used by the recurrence we need to account
5616       // for casts on the input operands of the recurrence.
5617       MaxWidth = std::min<unsigned>(
5618           MaxWidth, std::min<unsigned>(
5619                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5620                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5621     }
5622   } else {
5623     for (Type *T : ElementTypesInLoop) {
5624       MinWidth = std::min<unsigned>(
5625           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5626       MaxWidth = std::max<unsigned>(
5627           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
5628     }
5629   }
5630   return {MinWidth, MaxWidth};
5631 }
5632 
5633 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5634   ElementTypesInLoop.clear();
5635   // For each block.
5636   for (BasicBlock *BB : TheLoop->blocks()) {
5637     // For each instruction in the loop.
5638     for (Instruction &I : BB->instructionsWithoutDebug()) {
5639       Type *T = I.getType();
5640 
5641       // Skip ignored values.
5642       if (ValuesToIgnore.count(&I))
5643         continue;
5644 
5645       // Only examine Loads, Stores and PHINodes.
5646       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5647         continue;
5648 
5649       // Examine PHI nodes that are reduction variables. Update the type to
5650       // account for the recurrence type.
5651       if (auto *PN = dyn_cast<PHINode>(&I)) {
5652         if (!Legal->isReductionVariable(PN))
5653           continue;
5654         const RecurrenceDescriptor &RdxDesc =
5655             Legal->getReductionVars().find(PN)->second;
5656         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5657             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5658                                       RdxDesc.getRecurrenceType(),
5659                                       TargetTransformInfo::ReductionFlags()))
5660           continue;
5661         T = RdxDesc.getRecurrenceType();
5662       }
5663 
5664       // Examine the stored values.
5665       if (auto *ST = dyn_cast<StoreInst>(&I))
5666         T = ST->getValueOperand()->getType();
5667 
5668       assert(T->isSized() &&
5669              "Expected the load/store/recurrence type to be sized");
5670 
5671       ElementTypesInLoop.insert(T);
5672     }
5673   }
5674 }
5675 
5676 unsigned
5677 LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5678                                                   InstructionCost LoopCost) {
5679   // -- The interleave heuristics --
5680   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5681   // There are many micro-architectural considerations that we can't predict
5682   // at this level. For example, frontend pressure (on decode or fetch) due to
5683   // code size, or the number and capabilities of the execution ports.
5684   //
5685   // We use the following heuristics to select the interleave count:
5686   // 1. If the code has reductions, then we interleave to break the cross
5687   // iteration dependency.
5688   // 2. If the loop is really small, then we interleave to reduce the loop
5689   // overhead.
5690   // 3. We don't interleave if we think that we will spill registers to memory
5691   // due to the increased register pressure.
5692 
5693   if (!isScalarEpilogueAllowed())
5694     return 1;
5695 
  // If a finite maximum safe dependence distance was used to limit the VF,
  // interleaving would widen the effective VF further, so do not interleave.
5697   if (Legal->getMaxSafeDepDistBytes() != -1U)
5698     return 1;
5699 
5700   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5701   const bool HasReductions = !Legal->getReductionVars().empty();
5702   // Do not interleave loops with a relatively small known or estimated trip
5703   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because under those conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
5707   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5708       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5709     return 1;
5710 
5711   // If we did not calculate the cost for VF (because the user selected the VF)
5712   // then we calculate the cost of VF here.
5713   if (LoopCost == 0) {
5714     LoopCost = expectedCost(VF).first;
    assert(LoopCost.isValid() &&
           "Expected to have chosen a VF with valid cost");
5716 
5717     // Loop body is free and there is no need for interleaving.
5718     if (LoopCost == 0)
5719       return 1;
5720   }
5721 
5722   RegisterUsage R = calculateRegisterUsage({VF})[0];
5723   // We divide by these constants so assume that we have at least one
5724   // instruction that uses at least one register.
5725   for (auto& pair : R.MaxLocalUsers) {
5726     pair.second = std::max(pair.second, 1U);
5727   }
5728 
5729   // We calculate the interleave count using the following formula.
5730   // Subtract the number of loop invariants from the number of available
5731   // registers. These registers are used by all of the interleaved instances.
5732   // Next, divide the remaining registers by the number of registers that is
5733   // required by the loop, in order to estimate how many parallel instances
5734   // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when OptForSize, in which case IC is set
  // to 1 above.
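  // For example (hypothetical numbers): with 32 registers in a class, 2 of
  // them holding loop-invariant values and a maximum of 6 registers live at
  // once inside the loop, IC = PowerOf2Floor((32 - 2) / 6) = 4 for that
  // register class (the induction-variable heuristic below may adjust this).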
5740   unsigned IC = UINT_MAX;
5741 
5742   for (auto& pair : R.MaxLocalUsers) {
5743     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5744     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5745                       << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
5747     if (VF.isScalar()) {
5748       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5749         TargetNumRegisters = ForceTargetNumScalarRegs;
5750     } else {
5751       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5752         TargetNumRegisters = ForceTargetNumVectorRegs;
5753     }
5754     unsigned MaxLocalUsers = pair.second;
5755     unsigned LoopInvariantRegs = 0;
5756     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5757       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5758 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5760     // Don't count the induction variable as interleaved.
5761     if (EnableIndVarRegisterHeur) {
5762       TmpIC =
5763           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5764                         std::max(1U, (MaxLocalUsers - 1)));
5765     }
5766 
5767     IC = std::min(IC, TmpIC);
5768   }
5769 
5770   // Clamp the interleave ranges to reasonable counts.
5771   unsigned MaxInterleaveCount =
5772       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5773 
5774   // Check if the user has overridden the max.
5775   if (VF.isScalar()) {
5776     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5777       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5778   } else {
5779     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5780       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5781   }
5782 
5783   // If trip count is known or estimated compile time constant, limit the
5784   // interleave count to be less than the trip count divided by VF, provided it
5785   // is at least 1.
5786   //
5787   // For scalable vectors we can't know if interleaving is beneficial. It may
5788   // not be beneficial for small loops if none of the lanes in the second vector
  // iteration is enabled. However, for larger loops, there is likely to be a
5790   // similar benefit as for fixed-width vectors. For now, we choose to leave
5791   // the InterleaveCount as if vscale is '1', although if some information about
5792   // the vector is known (e.g. min vector size), we can make a better decision.
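  // For example (hypothetical numbers): with an estimated trip count of 24
  // and a known minimum VF of 4, the maximum interleave count is capped at
  // 24 / 4 = 6, and the clamping below keeps IC within [1, 6].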
5793   if (BestKnownTC) {
5794     MaxInterleaveCount =
5795         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5796     // Make sure MaxInterleaveCount is greater than 0.
5797     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5798   }
5799 
5800   assert(MaxInterleaveCount > 0 &&
5801          "Maximum interleave count must be greater than 0");
5802 
  // Clamp the calculated IC to be between 1 and the max interleave count that
  // the target and trip count allow.
5805   if (IC > MaxInterleaveCount)
5806     IC = MaxInterleaveCount;
5807   else
5808     // Make sure IC is greater than 0.
5809     IC = std::max(1u, IC);
5810 
5811   assert(IC > 0 && "Interleave count must be greater than 0.");
5812 
5813   // Interleave if we vectorized this loop and there is a reduction that could
5814   // benefit from interleaving.
5815   if (VF.isVector() && HasReductions) {
5816     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5817     return IC;
5818   }
5819 
5820   // For any scalar loop that either requires runtime checks or predication we
5821   // are better off leaving this to the unroller. Note that if we've already
5822   // vectorized the loop we will have done the runtime check and so interleaving
5823   // won't require further checks.
5824   bool ScalarInterleavingRequiresPredication =
5825       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5826          return Legal->blockNeedsPredication(BB);
5827        }));
5828   bool ScalarInterleavingRequiresRuntimePointerCheck =
5829       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5830 
5831   // We want to interleave small loops in order to reduce the loop overhead and
5832   // potentially expose ILP opportunities.
5833   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5834                     << "LV: IC is " << IC << '\n'
5835                     << "LV: VF is " << VF << '\n');
5836   const bool AggressivelyInterleaveReductions =
5837       TTI.enableAggressiveInterleaving(HasReductions);
5838   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5839       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5840     // We assume that the cost overhead is 1 and we use the cost model
5841     // to estimate the cost of the loop and interleave until the cost of the
5842     // loop overhead is about 5% of the cost of the loop.
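    // For example (hypothetical numbers): with SmallLoopCost = 20 and a loop
    // body costing 5, SmallIC = min(IC, PowerOf2Floor(20 / 5)) = min(IC, 4).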
5843     unsigned SmallIC = std::min(
5844         IC, (unsigned)PowerOf2Floor(SmallLoopCost / *LoopCost.getValue()));
5845 
5846     // Interleave until store/load ports (estimated by max interleave count) are
5847     // saturated.
5848     unsigned NumStores = Legal->getNumStores();
5849     unsigned NumLoads = Legal->getNumLoads();
5850     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5851     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5852 
5853     // There is little point in interleaving for reductions containing selects
5854     // and compares when VF=1 since it may just create more overhead than it's
5855     // worth for loops with small trip counts. This is because we still have to
5856     // do the final reduction after the loop.
5857     bool HasSelectCmpReductions =
5858         HasReductions &&
5859         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5860           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5861           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
5862               RdxDesc.getRecurrenceKind());
5863         });
5864     if (HasSelectCmpReductions) {
5865       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5866       return 1;
5867     }
5868 
5869     // If we have a scalar reduction (vector reductions are already dealt with
5870     // by this point), we can increase the critical path length if the loop
5871     // we're interleaving is inside another loop. For tree-wise reductions
5872     // set the limit to 2, and for ordered reductions it's best to disable
5873     // interleaving entirely.
5874     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5875       bool HasOrderedReductions =
5876           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5877             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5878             return RdxDesc.isOrdered();
5879           });
5880       if (HasOrderedReductions) {
5881         LLVM_DEBUG(
5882             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5883         return 1;
5884       }
5885 
5886       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5887       SmallIC = std::min(SmallIC, F);
5888       StoresIC = std::min(StoresIC, F);
5889       LoadsIC = std::min(LoadsIC, F);
5890     }
5891 
5892     if (EnableLoadStoreRuntimeInterleave &&
5893         std::max(StoresIC, LoadsIC) > SmallIC) {
5894       LLVM_DEBUG(
5895           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5896       return std::max(StoresIC, LoadsIC);
5897     }
5898 
5899     // If there are scalar reductions and TTI has enabled aggressive
5900     // interleaving for reductions, we will interleave to expose ILP.
5901     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5902         AggressivelyInterleaveReductions) {
5903       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressively as the normal
      // IC to satisfy the rare situation when resources are too limited.
5906       return std::max(IC / 2, SmallIC);
5907     } else {
5908       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5909       return SmallIC;
5910     }
5911   }
5912 
5913   // Interleave if this is a large loop (small loops are already dealt with by
5914   // this point) that could benefit from interleaving.
5915   if (AggressivelyInterleaveReductions) {
5916     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5917     return IC;
5918   }
5919 
5920   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5921   return 1;
5922 }
5923 
5924 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5925 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5926   // This function calculates the register usage by measuring the highest number
5927   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order and
5929   // assign a number to each instruction. We use RPO to ensure that defs are
5930   // met before their users. We assume that each instruction that has in-loop
5931   // users starts an interval. We record every time that an in-loop value is
5932   // used, so we have a list of the first and last occurrences of each
5933   // instruction. Next, we transpose this data structure into a multi map that
5934   // holds the list of intervals that *end* at a specific location. This multi
5935   // map allows us to perform a linear search. We scan the instructions linearly
5936   // and record each time that a new interval starts, by placing it in a set.
5937   // If we find this value in the multi-map then we remove it from the set.
5938   // The max register usage is the maximum size of the set.
5939   // We also search for instructions that are defined outside the loop, but are
5940   // used inside the loop. We need this number separately from the max-interval
5941   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
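  // For example (hypothetical): if %a is defined at position 1 and last used
  // at position 4, and %b is defined at position 2 and last used at position
  // 3, then both intervals are open at position 3, so at least two registers
  // (of the appropriate class) are live there.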
5943   LoopBlocksDFS DFS(TheLoop);
5944   DFS.perform(LI);
5945 
5946   RegisterUsage RU;
5947 
5948   // Each 'key' in the map opens a new interval. The values
5949   // of the map are the index of the 'last seen' usage of the
5950   // instruction that is the key.
5951   using IntervalMap = DenseMap<Instruction *, unsigned>;
5952 
5953   // Maps instruction to its index.
5954   SmallVector<Instruction *, 64> IdxToInstr;
5955   // Marks the end of each interval.
5956   IntervalMap EndPoint;
  // Saves the instructions that are used in the loop.
5958   SmallPtrSet<Instruction *, 8> Ends;
5959   // Saves the list of values that are used in the loop but are defined outside
5960   // the loop (not including non-instruction values such as arguments and
5961   // constants).
5962   SmallPtrSet<Value *, 8> LoopInvariants;
5963 
5964   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5965     for (Instruction &I : BB->instructionsWithoutDebug()) {
5966       IdxToInstr.push_back(&I);
5967 
5968       // Save the end location of each USE.
5969       for (Value *U : I.operands()) {
5970         auto *Instr = dyn_cast<Instruction>(U);
5971 
5972         // Ignore non-instruction values such as arguments, constants, etc.
        // FIXME: Might need some motivation why these values are ignored. If,
        // for example, an argument is used inside the loop, it will increase
        // the register pressure (so shouldn't we add it to LoopInvariants?).
5976         if (!Instr)
5977           continue;
5978 
5979         // If this instruction is outside the loop then record it and continue.
5980         if (!TheLoop->contains(Instr)) {
5981           LoopInvariants.insert(Instr);
5982           continue;
5983         }
5984 
5985         // Overwrite previous end points.
5986         EndPoint[Instr] = IdxToInstr.size();
5987         Ends.insert(Instr);
5988       }
5989     }
5990   }
5991 
5992   // Saves the list of intervals that end with the index in 'key'.
5993   using InstrList = SmallVector<Instruction *, 2>;
5994   DenseMap<unsigned, InstrList> TransposeEnds;
5995 
5996   // Transpose the EndPoints to a list of values that end at each index.
5997   for (auto &Interval : EndPoint)
5998     TransposeEnds[Interval.second].push_back(Interval.first);
5999 
6000   SmallPtrSet<Instruction *, 8> OpenIntervals;
6001   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6002   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6003 
6004   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6005 
6006   const auto &TTICapture = TTI;
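  // Estimate how many registers the target would need to hold a vector of VF
  // elements of type Ty. Token and other non-vectorizable element types are
  // counted as using no registers.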
6007   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6008     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6009       return 0;
6010     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
6011   };
6012 
6013   for (unsigned i = 0, s = IdxToInstr.size(); i < s; ++i) {
6014     Instruction *I = IdxToInstr[i];
6015 
6016     // Remove all of the instructions that end at this location.
6017     InstrList &List = TransposeEnds[i];
6018     for (Instruction *ToRemove : List)
6019       OpenIntervals.erase(ToRemove);
6020 
6021     // Ignore instructions that are never used within the loop.
6022     if (!Ends.count(I))
6023       continue;
6024 
6025     // Skip ignored values.
6026     if (ValuesToIgnore.count(I))
6027       continue;
6028 
6029     // For each VF find the maximum usage of registers.
6030     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6031       // Count the number of registers used, per register class, given all open
6032       // intervals.
6033       // Note that elements in this SmallMapVector will be default constructed
6034       // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
6035       // there is no previous entry for ClassID.
6036       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6037 
6038       if (VFs[j].isScalar()) {
6039         for (auto *Inst : OpenIntervals) {
6040           unsigned ClassID =
6041               TTI.getRegisterClassForType(false, Inst->getType());
6042           // FIXME: The target might use more than one register for the type
6043           // even in the scalar case.
6044           RegUsage[ClassID] += 1;
6045         }
6046       } else {
6047         collectUniformsAndScalars(VFs[j]);
6048         for (auto *Inst : OpenIntervals) {
6049           // Skip ignored values for VF > 1.
6050           if (VecValuesToIgnore.count(Inst))
6051             continue;
6052           if (isScalarAfterVectorization(Inst, VFs[j])) {
6053             unsigned ClassID =
6054                 TTI.getRegisterClassForType(false, Inst->getType());
6055             // FIXME: The target might use more than one register for the type
6056             // even in the scalar case.
6057             RegUsage[ClassID] += 1;
6058           } else {
6059             unsigned ClassID =
6060                 TTI.getRegisterClassForType(true, Inst->getType());
6061             RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6062           }
6063         }
6064       }
6065 
6066       for (auto& pair : RegUsage) {
6067         auto &Entry = MaxUsages[j][pair.first];
6068         Entry = std::max(Entry, pair.second);
6069       }
6070     }
6071 
6072     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6073                       << OpenIntervals.size() << '\n');
6074 
6075     // Add the current instruction to the list of open intervals.
6076     OpenIntervals.insert(I);
6077   }
6078 
6079   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6080     // Note that elements in this SmallMapVector will be default constructed
6081     // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
6082     // there is no previous entry for ClassID.
6083     SmallMapVector<unsigned, unsigned, 4> Invariant;
6084 
6085     for (auto *Inst : LoopInvariants) {
6086       // FIXME: The target might use more than one register for the type
6087       // even in the scalar case.
6088       unsigned Usage =
6089           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6090       unsigned ClassID =
6091           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6092       Invariant[ClassID] += Usage;
6093     }
6094 
6095     LLVM_DEBUG({
6096       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6097       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6098              << " item\n";
6099       for (const auto &pair : MaxUsages[i]) {
6100         dbgs() << "LV(REG): RegisterClass: "
6101                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6102                << " registers\n";
6103       }
6104       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6105              << " item\n";
6106       for (const auto &pair : Invariant) {
6107         dbgs() << "LV(REG): RegisterClass: "
6108                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6109                << " registers\n";
6110       }
6111     });
6112 
6113     RU.LoopInvariantRegs = Invariant;
6114     RU.MaxLocalUsers = MaxUsages[i];
6115     RUs[i] = RU;
6116   }
6117 
6118   return RUs;
6119 }
6120 
6121 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6122                                                            ElementCount VF) {
6123   // TODO: The cost model for emulated masked load/store is completely
6124   // broken. This hack guides the cost model to use an artificially
6125   // high value so as to practically disable vectorization with such
6126   // operations, except where the previously deployed legality hack allowed
6127   // using very low cost values. This is to avoid regressions coming simply
6128   // from moving the "masked load/store" check from legality to the cost
6129   // model. Masked load/gather emulation was previously never allowed;
6130   // only a limited amount of masked store/scatter emulation was allowed.
6131   assert(isPredicatedInst(I) &&
6132          "Expecting a scalar emulated instruction");
6133   return isa<LoadInst>(I) ||
6134          (isa<StoreInst>(I) &&
6135           NumPredStores > NumberOfStoresToPredicate);
6136 }
6137 
6138 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6139   // If we aren't vectorizing the loop, or if we've already collected the
6140   // instructions to scalarize, there's nothing to do. Collection may already
6141   // have occurred if we have a user-selected VF and are now computing the
6142   // expected cost for interleaving.
6143   if (VF.isScalar() || VF.isZero() ||
6144       InstsToScalarize.find(VF) != InstsToScalarize.end())
6145     return;
6146 
6147   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6148   // not profitable to scalarize any instructions, the presence of VF in the
6149   // map will indicate that we've analyzed it already.
6150   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6151 
6152   PredicatedBBsAfterVectorization[VF].clear();
6153 
6154   // Find all the instructions that are scalar with predication in the loop and
6155   // determine whether it would be better not to if-convert the blocks they
6156   // are in. If so, we also record the instructions to scalarize.
6157   for (BasicBlock *BB : TheLoop->blocks()) {
6158     if (!blockNeedsPredicationForAnyReason(BB))
6159       continue;
6160     for (Instruction &I : *BB)
6161       if (isScalarWithPredication(&I, VF)) {
6162         ScalarCostsTy ScalarCosts;
6163         // Do not apply discount if scalable, because that would lead to
6164         // invalid scalarization costs.
6165         // Do not apply discount logic if hacked cost is needed
6166         // for emulated masked memrefs.
6167         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6168             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6169           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6170         // Remember that BB will remain after vectorization.
6171         PredicatedBBsAfterVectorization[VF].insert(BB);
6172       }
6173   }
6174 }
6175 
6176 InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
6177     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6178   assert(!isUniformAfterVectorization(PredInst, VF) &&
6179          "Instruction marked uniform-after-vectorization will be predicated");
6180 
6181   // Initialize the discount to zero, meaning that the scalar version and the
6182   // vector version cost the same.
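  // For example, if the vector cost of the chain feeding PredInst ends up as
  // 10 and its probability-scaled scalar cost as 6, the discount is 4 and
  // scalarizing the chain is expected to be cheaper than vectorizing it.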
6183   InstructionCost Discount = 0;
6184 
6185   // Holds instructions to analyze. The instructions we visit are mapped in
6186   // ScalarCosts. Those instructions are the ones that would be scalarized if
6187   // we find that the scalar version costs less.
6188   SmallVector<Instruction *, 8> Worklist;
6189 
6190   // Returns true if the given instruction can be scalarized.
6191   auto canBeScalarized = [&](Instruction *I) -> bool {
6192     // We only attempt to scalarize instructions forming a single-use chain
6193     // from the original predicated block that would otherwise be vectorized.
6194     // Although not strictly necessary, we give up on instructions we know will
6195     // already be scalar to avoid traversing chains that are unlikely to be
6196     // beneficial.
6197     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6198         isScalarAfterVectorization(I, VF))
6199       return false;
6200 
6201     // If the instruction is scalar with predication, it will be analyzed
6202     // separately. We ignore it within the context of PredInst.
6203     if (isScalarWithPredication(I, VF))
6204       return false;
6205 
6206     // If any of the instruction's operands are uniform after vectorization,
6207     // the instruction cannot be scalarized. This prevents, for example, a
6208     // masked load from being scalarized.
6209     //
6210     // We assume we will only emit a value for lane zero of an instruction
6211     // marked uniform after vectorization, rather than VF identical values.
6212     // Thus, if we scalarize an instruction that uses a uniform, we would
6213     // create uses of values corresponding to the lanes we aren't emitting code
6214     // for. This behavior can be changed by allowing getScalarValue to clone
6215     // the lane zero values for uniforms rather than asserting.
6216     for (Use &U : I->operands())
6217       if (auto *J = dyn_cast<Instruction>(U.get()))
6218         if (isUniformAfterVectorization(J, VF))
6219           return false;
6220 
6221     // Otherwise, we can scalarize the instruction.
6222     return true;
6223   };
6224 
6225   // Compute the expected cost discount from scalarizing the entire expression
6226   // feeding the predicated instruction. We currently only consider expressions
6227   // that are single-use instruction chains.
6228   Worklist.push_back(PredInst);
6229   while (!Worklist.empty()) {
6230     Instruction *I = Worklist.pop_back_val();
6231 
6232     // If we've already analyzed the instruction, there's nothing to do.
6233     if (ScalarCosts.find(I) != ScalarCosts.end())
6234       continue;
6235 
6236     // Compute the cost of the vector instruction. Note that this cost already
6237     // includes the scalarization overhead of the predicated instruction.
6238     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6239 
6240     // Compute the cost of the scalarized instruction. This cost is the cost of
6241     // the instruction as if it wasn't if-converted and instead remained in the
6242     // predicated block. We will scale this cost by block probability after
6243     // computing the scalarization overhead.
6244     InstructionCost ScalarCost =
6245         VF.getFixedValue() *
6246         getInstructionCost(I, ElementCount::getFixed(1)).first;
6247 
6248     // Compute the scalarization overhead of needed insertelement instructions
6249     // and phi nodes.
6250     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6251     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6252       ScalarCost += TTI.getScalarizationOverhead(
6253           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6254           APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
6255           /*Extract*/ false, CostKind);
6256       ScalarCost +=
6257           VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
6258     }
6259 
6260     // Compute the scalarization overhead of needed extractelement
6261     // instructions. For each of the instruction's operands, if the operand can
6262     // be scalarized, add it to the worklist; otherwise, account for the
6263     // overhead.
6264     for (Use &U : I->operands())
6265       if (auto *J = dyn_cast<Instruction>(U.get())) {
6266         assert(VectorType::isValidElementType(J->getType()) &&
6267                "Instruction has non-scalar type");
6268         if (canBeScalarized(J))
6269           Worklist.push_back(J);
6270         else if (needsExtract(J, VF)) {
6271           ScalarCost += TTI.getScalarizationOverhead(
6272               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6273               APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
6274               /*Extract*/ true, CostKind);
6275         }
6276       }
6277 
6278     // Scale the total scalar cost by block probability.
6279     ScalarCost /= getReciprocalPredBlockProb();
6280 
6281     // Compute the discount. A non-negative discount means the vector version
6282     // of the instruction costs more, and scalarizing would be beneficial.
6283     Discount += VectorCost - ScalarCost;
6284     ScalarCosts[I] = ScalarCost;
6285   }
6286 
6287   return Discount;
6288 }
6289 
6290 LoopVectorizationCostModel::VectorizationCostTy
6291 LoopVectorizationCostModel::expectedCost(
6292     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6293   VectorizationCostTy Cost;
6294 
6295   // For each block.
6296   for (BasicBlock *BB : TheLoop->blocks()) {
6297     VectorizationCostTy BlockCost;
6298 
6299     // For each instruction in the old loop.
6300     for (Instruction &I : BB->instructionsWithoutDebug()) {
6301       // Skip ignored values.
6302       if (ValuesToIgnore.count(&I) ||
6303           (VF.isVector() && VecValuesToIgnore.count(&I)))
6304         continue;
6305 
6306       VectorizationCostTy C = getInstructionCost(&I, VF);
6307 
6308       // Check if we should override the cost.
6309       if (C.first.isValid() &&
6310           ForceTargetInstructionCost.getNumOccurrences() > 0)
6311         C.first = InstructionCost(ForceTargetInstructionCost);
6312 
6313       // Keep a list of instructions with invalid costs.
6314       if (Invalid && !C.first.isValid())
6315         Invalid->emplace_back(&I, VF);
6316 
6317       BlockCost.first += C.first;
6318       BlockCost.second |= C.second;
6319       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6320                         << " for VF " << VF << " For instruction: " << I
6321                         << '\n');
6322     }
6323 
6324     // If we are vectorizing a predicated block, it will have been
6325     // if-converted. This means that the block's instructions (aside from
6326     // stores and instructions that may divide by zero) will now be
6327     // unconditionally executed. For the scalar case, we may not always execute
6328     // the predicated block, if it is an if-else block. Thus, scale the block's
6329     // cost by the probability of executing it. blockNeedsPredication from
6330     // Legal is used so as to not include all blocks in tail folded loops.
6331     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6332       BlockCost.first /= getReciprocalPredBlockProb();
6333 
6334     Cost.first += BlockCost.first;
6335     Cost.second |= BlockCost.second;
6336   }
6337 
6338   return Cost;
6339 }
6340 
6341 /// Gets the address access SCEV after verifying that the access pattern is
6342 /// loop invariant except for the induction variable dependence.
6343 ///
6344 /// This SCEV can be sent to the Target in order to estimate the address
6345 /// calculation cost.
6346 static const SCEV *getAddressAccessSCEV(
6347               Value *Ptr,
6348               LoopVectorizationLegality *Legal,
6349               PredicatedScalarEvolution &PSE,
6350               const Loop *TheLoop) {
6351 
6352   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6353   if (!Gep)
6354     return nullptr;
6355 
6356   // We are looking for a gep with all loop invariant indices except for one
6357   // which should be an induction variable.
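  // E.g. a GEP such as 'getelementptr [256 x i32], ptr %A, i64 %inv, i64 %iv',
  // where %inv is loop invariant and %iv is the induction variable, qualifies.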
6358   auto SE = PSE.getSE();
6359   unsigned NumOperands = Gep->getNumOperands();
6360   for (unsigned i = 1; i < NumOperands; ++i) {
6361     Value *Opd = Gep->getOperand(i);
6362     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6363         !Legal->isInductionVariable(Opd))
6364       return nullptr;
6365   }
6366 
6367   // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6368   return PSE.getSCEV(Ptr);
6369 }
6370 
6371 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6372   return Legal->hasStride(I->getOperand(0)) ||
6373          Legal->hasStride(I->getOperand(1));
6374 }
6375 
6376 InstructionCost
6377 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6378                                                         ElementCount VF) {
6379   assert(VF.isVector() &&
6380          "Scalarization cost of instruction implies vectorization.");
6381   if (VF.isScalable())
6382     return InstructionCost::getInvalid();
6383 
6384   Type *ValTy = getLoadStoreType(I);
6385   auto SE = PSE.getSE();
6386 
6387   unsigned AS = getLoadStoreAddressSpace(I);
6388   Value *Ptr = getLoadStorePointerOperand(I);
6389   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6390   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6391   //       that it is being called from this specific place.
6392 
6393   // Figure out whether the access is strided and get the stride value
6394   // if it is known at compile time.
6395   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6396 
6397   // Get the cost of the scalar memory instruction and address computation.
6398   InstructionCost Cost =
6399       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6400 
6401   // Don't pass *I here, since it is scalar but will actually be part of a
6402   // vectorized loop where the user of it is a vectorized instruction.
6403   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6404   const Align Alignment = getLoadStoreAlignment(I);
6405   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
6406                                                       ValTy->getScalarType(),
6407                                                       Alignment, AS, CostKind);
6408 
6409   // Get the overhead of the extractelement and insertelement instructions
6410   // we might create due to scalarization.
6411   Cost += getScalarizationOverhead(I, VF, CostKind);
6412 
6413   // If we have a predicated load/store, it will need extra i1 extracts and
6414   // conditional branches, but may not be executed for each vector lane. Scale
6415   // the cost by the probability of executing the predicated block.
6416   if (isPredicatedInst(I)) {
6417     Cost /= getReciprocalPredBlockProb();
6418 
6419     // Add the cost of an i1 extract and a branch
6420     auto *Vec_i1Ty =
6421         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6422     Cost += TTI.getScalarizationOverhead(
6423         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6424         /*Insert=*/false, /*Extract=*/true, CostKind);
6425     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
6426 
6427     if (useEmulatedMaskMemRefHack(I, VF))
6428       // Artificially setting to a high enough value to practically disable
6429       // vectorization with such operations.
6430       Cost = 3000000;
6431   }
6432 
6433   return Cost;
6434 }
6435 
6436 InstructionCost
6437 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6438                                                     ElementCount VF) {
6439   Type *ValTy = getLoadStoreType(I);
6440   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6441   Value *Ptr = getLoadStorePointerOperand(I);
6442   unsigned AS = getLoadStoreAddressSpace(I);
6443   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6444   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6445 
6446   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6447          "Stride should be 1 or -1 for consecutive memory access");
6448   const Align Alignment = getLoadStoreAlignment(I);
6449   InstructionCost Cost = 0;
6450   if (Legal->isMaskRequired(I)) {
6451     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6452                                       CostKind);
6453   } else {
6454     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6455     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6456                                 CostKind, OpInfo, I);
6457   }
6458 
6459   bool Reverse = ConsecutiveStride < 0;
6460   if (Reverse)
6461     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6462                                std::nullopt, CostKind, 0);
6463   return Cost;
6464 }
6465 
6466 InstructionCost
6467 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6468                                                 ElementCount VF) {
6469   assert(Legal->isUniformMemOp(*I));
6470 
6471   Type *ValTy = getLoadStoreType(I);
6472   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6473   const Align Alignment = getLoadStoreAlignment(I);
6474   unsigned AS = getLoadStoreAddressSpace(I);
6475   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
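  // A uniform load is costed as an address computation, one scalar load and a
  // broadcast of the loaded value; a uniform store as an address computation
  // and one scalar store, plus an extract of the final lane's value when the
  // stored value is not loop invariant.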
6476   if (isa<LoadInst>(I)) {
6477     return TTI.getAddressComputationCost(ValTy) +
6478            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6479                                CostKind) +
6480            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6481   }
6482   StoreInst *SI = cast<StoreInst>(I);
6483 
6484   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6485   return TTI.getAddressComputationCost(ValTy) +
6486          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6487                              CostKind) +
6488          (isLoopInvariantStoreValue
6489               ? 0
6490               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6491                                        CostKind, VF.getKnownMinValue() - 1));
6492 }
6493 
6494 InstructionCost
6495 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6496                                                  ElementCount VF) {
6497   Type *ValTy = getLoadStoreType(I);
6498   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6499   const Align Alignment = getLoadStoreAlignment(I);
6500   const Value *Ptr = getLoadStorePointerOperand(I);
6501 
6502   return TTI.getAddressComputationCost(VectorTy) +
6503          TTI.getGatherScatterOpCost(
6504              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6505              TargetTransformInfo::TCK_RecipThroughput, I);
6506 }
6507 
6508 InstructionCost
6509 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6510                                                    ElementCount VF) {
6511   // TODO: Once we have support for interleaving with scalable vectors
6512   // we can calculate the cost properly here.
6513   if (VF.isScalable())
6514     return InstructionCost::getInvalid();
6515 
6516   Type *ValTy = getLoadStoreType(I);
6517   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6518   unsigned AS = getLoadStoreAddressSpace(I);
6519   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6520 
6521   auto Group = getInterleavedAccessGroup(I);
6522   assert(Group && "Failed to get an interleaved access group.");
6523 
6524   unsigned InterleaveFactor = Group->getFactor();
6525   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6526 
6527   // Holds the indices of existing members in the interleaved group.
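  // E.g. a factor-3 group in which only members 0 and 2 exist (a gap at
  // index 1) yields Indices = {0, 2}.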
6528   SmallVector<unsigned, 4> Indices;
6529   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6530     if (Group->getMember(IF))
6531       Indices.push_back(IF);
6532 
6533   // Calculate the cost of the whole interleaved group.
6534   bool UseMaskForGaps =
6535       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6536       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6537   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6538       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6539       AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps);
6540 
6541   if (Group->isReverse()) {
6542     // TODO: Add support for reversed masked interleaved access.
6543     assert(!Legal->isMaskRequired(I) &&
6544            "Reverse masked interleaved access not supported.");
6545     Cost += Group->getNumMembers() *
6546             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
6547                                std::nullopt, CostKind, 0);
6548   }
6549   return Cost;
6550 }
6551 
6552 std::optional<InstructionCost>
6553 LoopVectorizationCostModel::getReductionPatternCost(
6554     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6555   using namespace llvm::PatternMatch;
6556   // Early exit if there are no in-loop reductions.
6557   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6558     return std::nullopt;
6559   auto *VectorTy = cast<VectorType>(Ty);
6560 
6561   // We are looking for one of the following patterns, and its minimal cost:
6562   //  reduce(mul(ext(A), ext(B))) or
6563   //  reduce(mul(A, B)) or
6564   //  reduce(ext(A)) or
6565   //  reduce(A).
6566   // The basic idea is that we walk down the tree to do that, finding the root
6567   // reduction instruction in InLoopReductionImmediateChains. From there we find
6568   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6569   // of the components. If the reduction cost is lower then we return it for the
6570   // reduction instruction and 0 for the other instructions in the pattern. If
6571   // it is not, we return std::nullopt, indicating that the original cost
6572   // model should be used.
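  // For example, a vectorized reduce.add(mul(ext(A), ext(B))) might look like:
  //   %a.ext = sext <16 x i8> %a to <16 x i32>
  //   %b.ext = sext <16 x i8> %b to <16 x i32>
  //   %mul = mul <16 x i32> %a.ext, %b.ext
  //   %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
  // A target that can implement the whole chain as a single multiply-accumulate
  // reduction reports that via getMulAccReductionCost, which is compared below
  // against the summed cost of the individual instructions.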
6573   Instruction *RetI = I;
6574   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6575     if (!RetI->hasOneUser())
6576       return std::nullopt;
6577     RetI = RetI->user_back();
6578   }
6579 
6580   if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) &&
6581       RetI->user_back()->getOpcode() == Instruction::Add) {
6582     RetI = RetI->user_back();
6583   }
6584 
6585   // Test if the found instruction is a reduction, and if not return
6586   // std::nullopt so that the caller falls back to the original cost modelling.
6587   if (!InLoopReductionImmediateChains.count(RetI))
6588     return std::nullopt;
6589 
6590   // Find the reduction this chain is a part of and calculate the basic cost of
6591   // the reduction on its own.
6592   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6593   Instruction *ReductionPhi = LastChain;
6594   while (!isa<PHINode>(ReductionPhi))
6595     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6596 
6597   const RecurrenceDescriptor &RdxDesc =
6598       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6599 
6600   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6601       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6602 
6603   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6604   // normal fmul instruction to the cost of the fadd reduction.
6605   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6606     BaseCost +=
6607         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6608 
6609   // If we're using ordered reductions then we can just return the base cost
6610   // here, since getArithmeticReductionCost calculates the full ordered
6611   // reduction cost when FP reassociation is not allowed.
6612   if (useOrderedReductions(RdxDesc))
6613     return BaseCost;
6614 
6615   // Get the operand that was not the reduction chain and match it to one of the
6616   // patterns, returning the better cost if it is found.
6617   Instruction *RedOp = RetI->getOperand(1) == LastChain
6618                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6619                            : dyn_cast<Instruction>(RetI->getOperand(1));
6620 
6621   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6622 
6623   Instruction *Op0, *Op1;
6624   if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6625       match(RedOp,
6626             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6627       match(Op0, m_ZExtOrSExt(m_Value())) &&
6628       Op0->getOpcode() == Op1->getOpcode() &&
6629       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6630       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6631       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6632 
6633     // Matched reduce.add(ext(mul(ext(A), ext(B))))
6634     // Note that the extend opcodes need to all match, or if A==B they will have
6635     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6636     // which is equally fine.
6637     bool IsUnsigned = isa<ZExtInst>(Op0);
6638     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6639     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6640 
6641     InstructionCost ExtCost =
6642         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6643                              TTI::CastContextHint::None, CostKind, Op0);
6644     InstructionCost MulCost =
6645         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6646     InstructionCost Ext2Cost =
6647         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6648                              TTI::CastContextHint::None, CostKind, RedOp);
6649 
6650     InstructionCost RedCost = TTI.getMulAccReductionCost(
6651         IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6652 
6653     if (RedCost.isValid() &&
6654         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6655       return I == RetI ? RedCost : 0;
6656   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6657              !TheLoop->isLoopInvariant(RedOp)) {
6658     // Matched reduce(ext(A))
6659     bool IsUnsigned = isa<ZExtInst>(RedOp);
6660     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6661     InstructionCost RedCost = TTI.getExtendedReductionCost(
6662         RdxDesc.getOpcode(), IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6663         RdxDesc.getFastMathFlags(), CostKind);
6664 
6665     InstructionCost ExtCost =
6666         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6667                              TTI::CastContextHint::None, CostKind, RedOp);
6668     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6669       return I == RetI ? RedCost : 0;
6670   } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6671              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6672     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6673         Op0->getOpcode() == Op1->getOpcode() &&
6674         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6675       bool IsUnsigned = isa<ZExtInst>(Op0);
6676       Type *Op0Ty = Op0->getOperand(0)->getType();
6677       Type *Op1Ty = Op1->getOperand(0)->getType();
6678       Type *LargestOpTy =
6679           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6680                                                                     : Op0Ty;
6681       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6682 
6683       // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
6684       // different sizes. We take the largest type as the ext to reduce, and add
6685       // the remaining cost as, for example, reduce(mul(ext(ext(A)), ext(B))).
6686       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6687           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6688           TTI::CastContextHint::None, CostKind, Op0);
6689       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6690           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6691           TTI::CastContextHint::None, CostKind, Op1);
6692       InstructionCost MulCost =
6693           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6694 
6695       InstructionCost RedCost = TTI.getMulAccReductionCost(
6696           IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind);
6697       InstructionCost ExtraExtCost = 0;
6698       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6699         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6700         ExtraExtCost = TTI.getCastInstrCost(
6701             ExtraExtOp->getOpcode(), ExtType,
6702             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6703             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6704       }
6705 
6706       if (RedCost.isValid() &&
6707           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6708         return I == RetI ? RedCost : 0;
6709     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6710       // Matched reduce.add(mul())
6711       InstructionCost MulCost =
6712           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6713 
6714       InstructionCost RedCost = TTI.getMulAccReductionCost(
6715           true, RdxDesc.getRecurrenceType(), VectorTy, CostKind);
6716 
6717       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6718         return I == RetI ? RedCost : 0;
6719     }
6720   }
6721 
6722   return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6723 }
6724 
6725 InstructionCost
6726 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6727                                                      ElementCount VF) {
6728   // Calculate scalar cost only. Vectorization cost should be ready at this
6729   // moment.
6730   if (VF.isScalar()) {
6731     Type *ValTy = getLoadStoreType(I);
6732     const Align Alignment = getLoadStoreAlignment(I);
6733     unsigned AS = getLoadStoreAddressSpace(I);
6734 
6735     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
6736     return TTI.getAddressComputationCost(ValTy) +
6737            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6738                                TTI::TCK_RecipThroughput, OpInfo, I);
6739   }
6740   return getWideningCost(I, VF);
6741 }
6742 
6743 LoopVectorizationCostModel::VectorizationCostTy
6744 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6745                                                ElementCount VF) {
6746   // If we know that this instruction will remain uniform, check the cost of
6747   // the scalar version.
6748   if (isUniformAfterVectorization(I, VF))
6749     VF = ElementCount::getFixed(1);
6750 
6751   if (VF.isVector() && isProfitableToScalarize(I, VF))
6752     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6753 
6754   // Forced scalars do not have any scalarization overhead.
6755   auto ForcedScalar = ForcedScalars.find(VF);
6756   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6757     auto InstSet = ForcedScalar->second;
6758     if (InstSet.count(I))
6759       return VectorizationCostTy(
6760           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6761            VF.getKnownMinValue()),
6762           false);
6763   }
6764 
6765   Type *VectorTy;
6766   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6767 
6768   bool TypeNotScalarized = false;
6769   if (VF.isVector() && VectorTy->isVectorTy()) {
6770     if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6771       if (VF.isScalable())
6772         // <vscale x 1 x iN> is assumed to be profitable over iN because
6773         // scalable registers are a distinct register class from scalar ones.
6774         // If we ever find a target which wants to lower scalable vectors
6775         // back to scalars, we'll need to update this code to explicitly
6776         // ask TTI about the register class uses for each part.
6777         TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6778       else
6779         TypeNotScalarized = NumParts < VF.getKnownMinValue();
6780     } else
6781       C = InstructionCost::getInvalid();
6782   }
6783   return VectorizationCostTy(C, TypeNotScalarized);
6784 }
6785 
6786 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6787     Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
6788 
6789   // There is no mechanism yet to create a scalable scalarization loop,
6790   // so this is currently Invalid.
6791   if (VF.isScalable())
6792     return InstructionCost::getInvalid();
6793 
6794   if (VF.isScalar())
6795     return 0;
6796 
6797   InstructionCost Cost = 0;
6798   Type *RetTy = ToVectorTy(I->getType(), VF);
6799   if (!RetTy->isVoidTy() &&
6800       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6801     Cost += TTI.getScalarizationOverhead(
6802         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
6803         /*Insert*/ true,
6804         /*Extract*/ false, CostKind);
6805 
6806   // Some targets keep addresses scalar.
6807   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6808     return Cost;
6809 
6810   // Some targets support efficient element stores.
6811   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6812     return Cost;
6813 
6814   // Collect operands to consider.
6815   CallInst *CI = dyn_cast<CallInst>(I);
6816   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6817 
6818   // Skip operands that do not require extraction/scalarization and do not incur
6819   // any overhead.
6820   SmallVector<Type *> Tys;
6821   for (auto *V : filterExtractingOperands(Ops, VF))
6822     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6823   return Cost + TTI.getOperandsScalarizationOverhead(
6824                     filterExtractingOperands(Ops, VF), Tys, CostKind);
6825 }
6826 
6827 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6828   if (VF.isScalar())
6829     return;
6830   NumPredStores = 0;
6831   for (BasicBlock *BB : TheLoop->blocks()) {
6832     // For each instruction in the old loop.
6833     for (Instruction &I : *BB) {
6834       Value *Ptr =  getLoadStorePointerOperand(&I);
6835       if (!Ptr)
6836         continue;
6837 
6838       // TODO: We should generate better code and update the cost model for
6839       // predicated uniform stores. Today they are treated as any other
6840       // predicated store (see added test cases in
6841       // invariant-store-vectorization.ll).
6842       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6843         NumPredStores++;
6844 
6845       if (Legal->isUniformMemOp(I)) {
6846         auto isLegalToScalarize = [&]() {
6847           if (!VF.isScalable())
6848             // Scalarization of fixed length vectors "just works".
6849             return true;
6850 
6851           // We have dedicated lowering for unpredicated uniform loads and
6852           // stores.  Note that even with tail folding we know that at least
6853           // one lane is active (i.e. generalized predication is not possible
6854           // here), and the logic below depends on this fact.
6855           if (!foldTailByMasking())
6856             return true;
6857 
6858           // For scalable vectors, a uniform memop load is always
6859           // uniform-by-part and we know how to scalarize that.
6860           if (isa<LoadInst>(I))
6861             return true;
6862 
6863           // A uniform store isn't necessarily uniform-by-part
6864           // and we can't assume scalarization.
6865           auto &SI = cast<StoreInst>(I);
6866           return TheLoop->isLoopInvariant(SI.getValueOperand());
6867         };
6868 
6869         const InstructionCost GatherScatterCost =
6870           isLegalGatherOrScatter(&I, VF) ?
6871           getGatherScatterCost(&I, VF) : InstructionCost::getInvalid();
6872 
6873         // Load: Scalar load + broadcast
6874         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6875         // FIXME: This cost is a significant under-estimate for tail folded
6876         // memory ops.
6877         const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6878           getUniformMemOpCost(&I, VF) : InstructionCost::getInvalid();
6879 
6880         // Choose the better solution for the current VF. Note that invalid
6881         // costs compare as maximally large. If both are invalid, the chosen
6882         // cost is invalid, which signals a failure and a vectorization abort.
6883         if (GatherScatterCost < ScalarizationCost)
6884           setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost);
6885         else
6886           setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost);
6887         continue;
6888       }
6889 
6890       // We assume that widening is the best solution when possible.
6891       if (memoryInstructionCanBeWidened(&I, VF)) {
6892         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6893         int ConsecutiveStride = Legal->isConsecutivePtr(
6894             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6895         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6896                "Expected consecutive stride.");
6897         InstWidening Decision =
6898             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6899         setWideningDecision(&I, VF, Decision, Cost);
6900         continue;
6901       }
6902 
6903       // Choose between Interleaving, Gather/Scatter or Scalarization.
6904       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6905       unsigned NumAccesses = 1;
6906       if (isAccessInterleaved(&I)) {
6907         auto Group = getInterleavedAccessGroup(&I);
6908         assert(Group && "Failed to get an interleaved access group.");
6909 
6910         // Make one decision for the whole group.
6911         if (getWideningDecision(&I, VF) != CM_Unknown)
6912           continue;
6913 
6914         NumAccesses = Group->getNumMembers();
6915         if (interleavedAccessCanBeWidened(&I, VF))
6916           InterleaveCost = getInterleaveGroupCost(&I, VF);
6917       }
6918 
6919       InstructionCost GatherScatterCost =
6920           isLegalGatherOrScatter(&I, VF)
6921               ? getGatherScatterCost(&I, VF) * NumAccesses
6922               : InstructionCost::getInvalid();
6923 
6924       InstructionCost ScalarizationCost =
6925           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6926 
6927       // Choose the best solution for the current VF, record the decision,
6928       // and use it during vectorization.
6929       InstructionCost Cost;
6930       InstWidening Decision;
6931       if (InterleaveCost <= GatherScatterCost &&
6932           InterleaveCost < ScalarizationCost) {
6933         Decision = CM_Interleave;
6934         Cost = InterleaveCost;
6935       } else if (GatherScatterCost < ScalarizationCost) {
6936         Decision = CM_GatherScatter;
6937         Cost = GatherScatterCost;
6938       } else {
6939         Decision = CM_Scalarize;
6940         Cost = ScalarizationCost;
6941       }
6942       // If the instruction belongs to an interleave group, the whole group
6943       // receives the same decision. The cost is computed for the whole group,
6944       // but it will actually be assigned to a single member instruction.
6945       if (auto Group = getInterleavedAccessGroup(&I))
6946         setWideningDecision(Group, VF, Decision, Cost);
6947       else
6948         setWideningDecision(&I, VF, Decision, Cost);
6949     }
6950   }
6951 
6952   // Make sure that any load of address and any other address computation
6953   // remains scalar unless there is gather/scatter support. This avoids
6954   // inevitable extracts into address registers, and also has the benefit of
6955   // activating LSR more, since that pass can't optimize vectorized
6956   // addresses.
6957   if (TTI.prefersVectorizedAddressing())
6958     return;
6959 
6960   // Start with all scalar pointer uses.
6961   SmallPtrSet<Instruction *, 8> AddrDefs;
6962   for (BasicBlock *BB : TheLoop->blocks())
6963     for (Instruction &I : *BB) {
6964       Instruction *PtrDef =
6965         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6966       if (PtrDef && TheLoop->contains(PtrDef) &&
6967           getWideningDecision(&I, VF) != CM_GatherScatter)
6968         AddrDefs.insert(PtrDef);
6969     }
6970 
6971   // Add all instructions used to generate the addresses.
6972   SmallVector<Instruction *, 4> Worklist;
6973   append_range(Worklist, AddrDefs);
6974   while (!Worklist.empty()) {
6975     Instruction *I = Worklist.pop_back_val();
6976     for (auto &Op : I->operands())
6977       if (auto *InstOp = dyn_cast<Instruction>(Op))
6978         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6979             AddrDefs.insert(InstOp).second)
6980           Worklist.push_back(InstOp);
6981   }
6982 
6983   for (auto *I : AddrDefs) {
6984     if (isa<LoadInst>(I)) {
6985       // Setting the desired widening decision should ideally be handled by
6986       // the cost functions, but since this involves finding out whether the
6987       // loaded value is involved in an address computation, it is instead
6988       // changed here when we know this is the case.
6989       InstWidening Decision = getWideningDecision(I, VF);
6990       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6991         // Scalarize a widened load of address.
6992         setWideningDecision(
6993             I, VF, CM_Scalarize,
6994             (VF.getKnownMinValue() *
6995              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6996       else if (auto Group = getInterleavedAccessGroup(I)) {
6997         // Scalarize an interleave group of address loads.
6998         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6999           if (Instruction *Member = Group->getMember(I))
7000             setWideningDecision(
7001                 Member, VF, CM_Scalarize,
7002                 (VF.getKnownMinValue() *
7003                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7004         }
7005       }
7006     } else
7007       // Make sure I gets scalarized and a cost estimate without
7008       // scalarization overhead.
7009       ForcedScalars[VF].insert(I);
7010   }
7011 }
7012 
7013 InstructionCost
7014 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7015                                                Type *&VectorTy) {
7016   Type *RetTy = I->getType();
7017   if (canTruncateToMinimalBitwidth(I, VF))
7018     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7019   auto SE = PSE.getSE();
7020   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7021 
7022   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7023                                                 ElementCount VF) -> bool {
7024     if (VF.isScalar())
7025       return true;
7026 
7027     auto Scalarized = InstsToScalarize.find(VF);
7028     assert(Scalarized != InstsToScalarize.end() &&
7029            "VF not yet analyzed for scalarization profitability");
7030     return !Scalarized->second.count(I) &&
7031            llvm::all_of(I->users(), [&](User *U) {
7032              auto *UI = cast<Instruction>(U);
7033              return !Scalarized->second.count(UI);
7034            });
7035   };
7036   (void) hasSingleCopyAfterVectorization;
7037 
7038   if (isScalarAfterVectorization(I, VF)) {
7039     // With the exception of GEPs and PHIs, after scalarization there should
7040     // only be one copy of the instruction generated in the loop. This is
7041     // because the VF is either 1, or any instructions that need scalarizing
7042     // have already been dealt with by the time we get here. As a result,
7043     // it means we don't have to multiply the instruction cost by VF.
7044     assert(I->getOpcode() == Instruction::GetElementPtr ||
7045            I->getOpcode() == Instruction::PHI ||
7046            (I->getOpcode() == Instruction::BitCast &&
7047             I->getType()->isPointerTy()) ||
7048            hasSingleCopyAfterVectorization(I, VF));
7049     VectorTy = RetTy;
7050   } else
7051     VectorTy = ToVectorTy(RetTy, VF);
7052 
7053   // TODO: We need to estimate the cost of intrinsic calls.
7054   switch (I->getOpcode()) {
7055   case Instruction::GetElementPtr:
7056     // We mark this instruction as zero-cost because the cost of GEPs in
7057     // vectorized code depends on whether the corresponding memory instruction
7058     // is scalarized or not. Therefore, we handle GEPs with the memory
7059     // instruction cost.
7060     return 0;
7061   case Instruction::Br: {
7062     // In cases of scalarized and predicated instructions, there will be VF
7063     // predicated blocks in the vectorized loop. Each branch around these
7064     // blocks also requires an extract of its vector compare i1 element.
7065     bool ScalarPredicatedBB = false;
7066     BranchInst *BI = cast<BranchInst>(I);
7067     if (VF.isVector() && BI->isConditional() &&
7068         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
7069          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
7070       ScalarPredicatedBB = true;
7071 
7072     if (ScalarPredicatedBB) {
7073       // Not possible to scalarize scalable vectors with predicated instructions.
7074       if (VF.isScalable())
7075         return InstructionCost::getInvalid();
7076       // Return cost for branches around scalarized and predicated blocks.
7077       auto *Vec_i1Ty =
7078           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7079       return (
7080           TTI.getScalarizationOverhead(
7081               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
7082               /*Insert*/ false, /*Extract*/ true, CostKind) +
7083           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7084     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7085       // The back-edge branch will remain, as will all scalar branches.
7086       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7087     else
7088       // This branch will be eliminated by if-conversion.
7089       return 0;
7090     // Note: We currently assume zero cost for an unconditional branch inside
7091     // a predicated block since it will become a fall-through, although we
7092     // may decide in the future to call TTI for all branches.
7093   }
7094   case Instruction::PHI: {
7095     auto *Phi = cast<PHINode>(I);
7096 
7097     // First-order recurrences are replaced by vector shuffles inside the loop.
7098     if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
7099       SmallVector<int> Mask(VF.getKnownMinValue());
7100       std::iota(Mask.begin(), Mask.end(), VF.getKnownMinValue() - 1);
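      // E.g. for VF = 4 the mask is <3, 4, 5, 6>: the last element of the
      // previous iteration's vector followed by the first three elements of
      // the current one.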
7101       return TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
7102                                 cast<VectorType>(VectorTy), Mask, CostKind,
7103                                 VF.getKnownMinValue() - 1);
7104     }
7105 
7106     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7107     // converted into select instructions. We require N - 1 selects per phi
7108     // node, where N is the number of incoming values.
7109     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7110       return (Phi->getNumIncomingValues() - 1) *
7111              TTI.getCmpSelInstrCost(
7112                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7113                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7114                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7115 
7116     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7117   }
7118   case Instruction::UDiv:
7119   case Instruction::SDiv:
7120   case Instruction::URem:
7121   case Instruction::SRem:
7122     if (VF.isVector() && isPredicatedInst(I)) {
7123       const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
7124       return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
7125         ScalarCost : SafeDivisorCost;
7126     }
7127     // We've proven all lanes safe to speculate, fall through.
7128     [[fallthrough]];
7129   case Instruction::Add:
7130   case Instruction::FAdd:
7131   case Instruction::Sub:
7132   case Instruction::FSub:
7133   case Instruction::Mul:
7134   case Instruction::FMul:
7135   case Instruction::FDiv:
7136   case Instruction::FRem:
7137   case Instruction::Shl:
7138   case Instruction::LShr:
7139   case Instruction::AShr:
7140   case Instruction::And:
7141   case Instruction::Or:
7142   case Instruction::Xor: {
7143     // Since we will replace the stride by 1, the multiplication should go away.
7144     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7145       return 0;
7146 
7147     // Detect reduction patterns
7148     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7149       return *RedCost;
7150 
7151     // Certain instructions can be cheaper to vectorize if they have a constant
7152     // second vector operand. One example of this is shifts on x86.
7153     Value *Op2 = I->getOperand(1);
7154     auto Op2Info = TTI.getOperandInfo(Op2);
7155     if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7156       Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
7157 
7158     SmallVector<const Value *, 4> Operands(I->operand_values());
7159     return TTI.getArithmeticInstrCost(
7160         I->getOpcode(), VectorTy, CostKind,
7161         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7162         Op2Info, Operands, I);
7163   }
7164   case Instruction::FNeg: {
7165     return TTI.getArithmeticInstrCost(
7166         I->getOpcode(), VectorTy, CostKind,
7167         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7168         {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
7169         I->getOperand(0), I);
7170   }
7171   case Instruction::Select: {
7172     SelectInst *SI = cast<SelectInst>(I);
7173     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7174     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7175 
7176     const Value *Op0, *Op1;
7177     using namespace llvm::PatternMatch;
7178     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7179                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7180       // select x, y, false --> x & y
7181       // select x, true, y --> x | y
7182       const auto [Op1VK, Op1VP] = TTI::getOperandInfo(Op0);
7183       const auto [Op2VK, Op2VP] = TTI::getOperandInfo(Op1);
7184       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7185               Op1->getType()->getScalarSizeInBits() == 1);
7186 
7187       SmallVector<const Value *, 2> Operands{Op0, Op1};
7188       return TTI.getArithmeticInstrCost(
7189           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7190           CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, I);
7191     }
7192 
7193     Type *CondTy = SI->getCondition()->getType();
7194     if (!ScalarCond)
7195       CondTy = VectorType::get(CondTy, VF);
7196 
7197     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7198     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7199       Pred = Cmp->getPredicate();
7200     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7201                                   CostKind, I);
7202   }
7203   case Instruction::ICmp:
7204   case Instruction::FCmp: {
7205     Type *ValTy = I->getOperand(0)->getType();
7206     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7207     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7208       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7209     VectorTy = ToVectorTy(ValTy, VF);
7210     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7211                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7212                                   I);
7213   }
7214   case Instruction::Store:
7215   case Instruction::Load: {
7216     ElementCount Width = VF;
7217     if (Width.isVector()) {
7218       InstWidening Decision = getWideningDecision(I, Width);
7219       assert(Decision != CM_Unknown &&
7220              "CM decision should be taken at this point");
7221       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
7222         return InstructionCost::getInvalid();
7223       if (Decision == CM_Scalarize)
7224         Width = ElementCount::getFixed(1);
7225     }
7226     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7227     return getMemoryInstructionCost(I, VF);
7228   }
7229   case Instruction::BitCast:
7230     if (I->getType()->isPointerTy())
7231       return 0;
7232     [[fallthrough]];
7233   case Instruction::ZExt:
7234   case Instruction::SExt:
7235   case Instruction::FPToUI:
7236   case Instruction::FPToSI:
7237   case Instruction::FPExt:
7238   case Instruction::PtrToInt:
7239   case Instruction::IntToPtr:
7240   case Instruction::SIToFP:
7241   case Instruction::UIToFP:
7242   case Instruction::Trunc:
7243   case Instruction::FPTrunc: {
7244     // Computes the CastContextHint from a Load/Store instruction.
7245     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7246       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7247              "Expected a load or a store!");
7248 
7249       if (VF.isScalar() || !TheLoop->contains(I))
7250         return TTI::CastContextHint::Normal;
7251 
7252       switch (getWideningDecision(I, VF)) {
7253       case LoopVectorizationCostModel::CM_GatherScatter:
7254         return TTI::CastContextHint::GatherScatter;
7255       case LoopVectorizationCostModel::CM_Interleave:
7256         return TTI::CastContextHint::Interleave;
7257       case LoopVectorizationCostModel::CM_Scalarize:
7258       case LoopVectorizationCostModel::CM_Widen:
7259         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7260                                         : TTI::CastContextHint::Normal;
7261       case LoopVectorizationCostModel::CM_Widen_Reverse:
7262         return TTI::CastContextHint::Reversed;
7263       case LoopVectorizationCostModel::CM_Unknown:
7264         llvm_unreachable("Instr did not go through cost modelling?");
7265       }
7266 
7267       llvm_unreachable("Unhandled case!");
7268     };
7269 
7270     unsigned Opcode = I->getOpcode();
7271     TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc/FPTrunc, the context is the only user, which must be a
    // StoreInst.
7273     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7274       if (I->hasOneUse())
7275         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7276           CCH = ComputeCCH(Store);
7277     }
    // For Z/Sext and FPExt, the context is the operand, which must be a
    // LoadInst.
7279     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7280              Opcode == Instruction::FPExt) {
7281       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7282         CCH = ComputeCCH(Load);
7283     }
7284 
7285     // We optimize the truncation of induction variables having constant
7286     // integer steps. The cost of these truncations is the same as the scalar
7287     // operation.
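    // For example, a "trunc i64 %iv to i32" of an induction with a constant
    // step can be generated directly as an i32 induction, so it is costed as
    // the scalar truncate rather than as a vector truncate.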
7288     if (isOptimizableIVTruncate(I, VF)) {
7289       auto *Trunc = cast<TruncInst>(I);
7290       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7291                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7292     }
7293 
7294     // Detect reduction patterns
7295     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7296       return *RedCost;
7297 
7298     Type *SrcScalarTy = I->getOperand(0)->getType();
7299     Type *SrcVecTy =
7300         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7301     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7305       //
7306       // Calculate the modified src and dest types.
7307       Type *MinVecTy = VectorTy;
7308       if (Opcode == Instruction::Trunc) {
7309         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7310         VectorTy =
7311             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7312       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7313         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7314         VectorTy =
7315             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7316       }
7317     }
7318 
7319     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7320   }
7321   case Instruction::Call: {
7322     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7323       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7324         return *RedCost;
7325     bool NeedToScalarize;
7326     CallInst *CI = cast<CallInst>(I);
7327     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7328     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7329       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7330       return std::min(CallCost, IntrinsicCost);
7331     }
7332     return CallCost;
7333   }
7334   case Instruction::ExtractValue:
7335     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7336   case Instruction::Alloca:
7337     // We cannot easily widen alloca to a scalable alloca, as
7338     // the result would need to be a vector of pointers.
7339     if (VF.isScalable())
7340       return InstructionCost::getInvalid();
7341     [[fallthrough]];
7342   default:
7343     // This opcode is unknown. Assume that it is the same as 'mul'.
7344     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7345   } // end of switch.
7346 }
7347 
7348 char LoopVectorize::ID = 0;
7349 
7350 static const char lv_name[] = "Loop Vectorization";
7351 
7352 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7353 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7354 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7355 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7356 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7357 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7358 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7359 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7360 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7361 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7362 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7363 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7364 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7365 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7366 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7367 
7368 namespace llvm {
7369 
7370 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7371 
7372 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7373                               bool VectorizeOnlyWhenForced) {
7374   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7375 }
7376 
7377 } // end namespace llvm
7378 
7379 void LoopVectorizationCostModel::collectValuesToIgnore() {
7380   // Ignore ephemeral values.
7381   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7382 
  // Find all stores to invariant variables. Since they are going to be sunk
  // outside the loop, we do not need to calculate their cost.
7385   for (BasicBlock *BB : TheLoop->blocks())
7386     for (Instruction &I : *BB) {
7387       StoreInst *SI;
7388       if ((SI = dyn_cast<StoreInst>(&I)) &&
7389           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7390         ValuesToIgnore.insert(&I);
7391     }
7392 
7393   // Ignore type-promoting instructions we identified during reduction
7394   // detection.
7395   for (const auto &Reduction : Legal->getReductionVars()) {
7396     const RecurrenceDescriptor &RedDes = Reduction.second;
7397     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7398     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7399   }
7400   // Ignore type-casting instructions we identified during induction
7401   // detection.
7402   for (const auto &Induction : Legal->getInductionVars()) {
7403     const InductionDescriptor &IndDes = Induction.second;
7404     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7405     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7406   }
7407 }
7408 
7409 void LoopVectorizationCostModel::collectInLoopReductions() {
7410   for (const auto &Reduction : Legal->getReductionVars()) {
7411     PHINode *Phi = Reduction.first;
7412     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7413 
7414     // We don't collect reductions that are type promoted (yet).
7415     if (RdxDesc.getRecurrenceType() != Phi->getType())
7416       continue;
7417 
7418     // If the target would prefer this reduction to happen "in-loop", then we
7419     // want to record it as such.
7420     unsigned Opcode = RdxDesc.getOpcode();
7421     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7422         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7423                                    TargetTransformInfo::ReductionFlags()))
7424       continue;
7425 
7426     // Check that we can correctly put the reductions into the loop, by
7427     // finding the chain of operations that leads from the phi to the loop
7428     // exit value.
7429     SmallVector<Instruction *, 4> ReductionOperations =
7430         RdxDesc.getReductionOpChain(Phi, TheLoop);
7431     bool InLoop = !ReductionOperations.empty();
7432     if (InLoop) {
7433       InLoopReductionChains[Phi] = ReductionOperations;
7434       // Add the elements to InLoopReductionImmediateChains for cost modelling.
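      // For example, if the chain for %phi is {%add1, %add2}, this records
      // %add1 -> %phi and %add2 -> %add1.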
7435       Instruction *LastChain = Phi;
7436       for (auto *I : ReductionOperations) {
7437         InLoopReductionImmediateChains[I] = LastChain;
7438         LastChain = I;
7439       }
7440     }
7441     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7442                       << " reduction for phi: " << *Phi << "\n");
7443   }
7444 }
7445 
7446 // TODO: we could return a pair of values that specify the max VF and
7447 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7449 // doesn't have a cost model that can choose which plan to execute if
7450 // more than one is generated.
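// determineVPlanVF simply divides the widest vector register width by the
// widest scalar type in the loop; for example, 256-bit registers with a
// widest type of i32 give a VF of 8.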
7451 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7452                                  LoopVectorizationCostModel &CM) {
7453   unsigned WidestType;
7454   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7455   return WidestVectorRegBits / WidestType;
7456 }
7457 
7458 VectorizationFactor
7459 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7460   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7461   ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
7463   // transformations before even evaluating whether vectorization is profitable.
7464   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7465   // the vectorization pipeline.
7466   if (!OrigLoop->isInnermost()) {
7467     // If the user doesn't provide a vectorization factor, determine a
7468     // reasonable one.
7469     if (UserVF.isZero()) {
7470       VF = ElementCount::getFixed(determineVPlanVF(
7471           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7472               .getFixedValue(),
7473           CM));
7474       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7475 
7476       // Make sure we have a VF > 1 for stress testing.
7477       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7478         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7479                           << "overriding computed VF.\n");
7480         VF = ElementCount::getFixed(4);
7481       }
7482     }
7483     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7484     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7485            "VF needs to be a power of two");
7486     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7487                       << "VF " << VF << " to build VPlans.\n");
7488     buildVPlans(VF, VF);
7489 
7490     // For VPlan build stress testing, we bail out after VPlan construction.
7491     if (VPlanBuildStressTest)
7492       return VectorizationFactor::Disabled();
7493 
7494     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7495   }
7496 
7497   LLVM_DEBUG(
7498       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7499                 "VPlan-native path.\n");
7500   return VectorizationFactor::Disabled();
7501 }
7502 
7503 std::optional<VectorizationFactor>
7504 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7505   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7506   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7508     return std::nullopt;
7509 
  // Invalidate interleave groups if all blocks of the loop will be predicated.
7511   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7512       !useMaskedInterleavedAccesses(*TTI)) {
7513     LLVM_DEBUG(
7514         dbgs()
7515         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7516            "which requires masked-interleaved support.\n");
7517     if (CM.InterleaveInfo.invalidateGroups())
7518       // Invalidating interleave groups also requires invalidating all decisions
7519       // based on them, which includes widening decisions and uniform and scalar
7520       // values.
7521       CM.invalidateCostModelingDecisions();
7522   }
7523 
7524   ElementCount MaxUserVF =
7525       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7526   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7527   if (!UserVF.isZero() && UserVFIsLegal) {
7528     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7529            "VF needs to be a power of two");
7530     // Collect the instructions (and their associated costs) that will be more
7531     // profitable to scalarize.
7532     if (CM.selectUserVectorizationFactor(UserVF)) {
7533       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7534       CM.collectInLoopReductions();
7535       buildVPlansWithVPRecipes(UserVF, UserVF);
7536       LLVM_DEBUG(printPlans(dbgs()));
7537       return {{UserVF, 0, 0}};
7538     } else
7539       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7540                               "InvalidCost", ORE, OrigLoop);
7541   }
7542 
7543   // Populate the set of Vectorization Factor Candidates.
7544   ElementCountSet VFCandidates;
7545   for (auto VF = ElementCount::getFixed(1);
7546        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7547     VFCandidates.insert(VF);
7548   for (auto VF = ElementCount::getScalable(1);
7549        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7550     VFCandidates.insert(VF);
7551 
7552   for (const auto &VF : VFCandidates) {
7553     // Collect Uniform and Scalar instructions after vectorization with VF.
7554     CM.collectUniformsAndScalars(VF);
7555 
7556     // Collect the instructions (and their associated costs) that will be more
7557     // profitable to scalarize.
7558     if (VF.isVector())
7559       CM.collectInstsToScalarize(VF);
7560   }
7561 
7562   CM.collectInLoopReductions();
7563   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7564   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7565 
7566   LLVM_DEBUG(printPlans(dbgs()));
7567   if (!MaxFactors.hasVector())
7568     return VectorizationFactor::Disabled();
7569 
7570   // Select the optimal vectorization factor.
7571   VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates);
  assert((VF.Width.isScalar() || VF.ScalarCost > 0) &&
         "when vectorizing, the scalar cost must be non-zero.");
7573   return VF;
7574 }
7575 
7576 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7577   assert(count_if(VPlans,
7578                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7579              1 &&
7580          "Best VF has not a single VPlan.");
7581 
7582   for (const VPlanPtr &Plan : VPlans) {
7583     if (Plan->hasVF(VF))
7584       return *Plan.get();
7585   }
7586   llvm_unreachable("No plan found!");
7587 }
7588 
7589 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7590   SmallVector<Metadata *, 4> MDs;
7591   // Reserve first location for self reference to the LoopID metadata node.
7592   MDs.push_back(nullptr);
7593   bool IsUnrollMetadata = false;
7594   MDNode *LoopID = L->getLoopID();
7595   if (LoopID) {
7596     // First find existing loop unrolling disable metadata.
7597     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7598       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7599       if (MD) {
7600         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7601         IsUnrollMetadata =
7602             S && S->getString().startswith("llvm.loop.unroll.disable");
7603       }
7604       MDs.push_back(LoopID->getOperand(i));
7605     }
7606   }
7607 
7608   if (!IsUnrollMetadata) {
7609     // Add runtime unroll disable metadata.
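    // The resulting loop ID will look roughly like:
    //   !0 = !{!0, <existing operands>, !1}
    //   !1 = !{!"llvm.loop.unroll.runtime.disable"}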
7610     LLVMContext &Context = L->getHeader()->getContext();
7611     SmallVector<Metadata *, 1> DisableOperands;
7612     DisableOperands.push_back(
7613         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7614     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7615     MDs.push_back(DisableNode);
7616     MDNode *NewLoopID = MDNode::get(Context, MDs);
7617     // Set operand 0 to refer to the loop id itself.
7618     NewLoopID->replaceOperandWith(0, NewLoopID);
7619     L->setLoopID(NewLoopID);
7620   }
7621 }
7622 
7623 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7624                                            VPlan &BestVPlan,
7625                                            InnerLoopVectorizer &ILV,
7626                                            DominatorTree *DT,
7627                                            bool IsEpilogueVectorization) {
7628   assert(BestVPlan.hasVF(BestVF) &&
7629          "Trying to execute plan with unsupported VF");
7630   assert(BestVPlan.hasUF(BestUF) &&
7631          "Trying to execute plan with unsupported UF");
7632 
7633   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7634                     << '\n');
7635 
7636   // Workaround!  Compute the trip count of the original loop and cache it
7637   // before we start modifying the CFG.  This code has a systemic problem
7638   // wherein it tries to run analysis over partially constructed IR; this is
7639   // wrong, and not simply for SCEV.  The trip count of the original loop
7640   // simply happens to be prone to hitting this in practice.  In theory, we
7641   // can hit the same issue for any SCEV, or ValueTracking query done during
7642   // mutation.  See PR49900.
7643   ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader());
7644 
7645   if (!IsEpilogueVectorization)
7646     VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
7647 
7648   // Perform the actual loop transformation.
7649 
7650   // 1. Set up the skeleton for vectorization, including vector pre-header and
7651   // middle block. The vector loop is created during VPlan execution.
7652   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7653   Value *CanonicalIVStartValue;
7654   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7655       ILV.createVectorizedLoopSkeleton();
7656 
7657   // Only use noalias metadata when using memory checks guaranteeing no overlap
7658   // across all iterations.
7659   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7660   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7661       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7662 
7663     //  We currently don't use LoopVersioning for the actual loop cloning but we
7664     //  still use it to add the noalias metadata.
7665     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7666     //        metadata.
7667     State.LVer = std::make_unique<LoopVersioning>(
7668         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7669         PSE.getSE());
7670     State.LVer->prepareNoAliasMetadata();
7671   }
7672 
7673   ILV.collectPoisonGeneratingRecipes(State);
7674 
7675   ILV.printDebugTracesAtStart();
7676 
7677   //===------------------------------------------------===//
7678   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
7682   //
7683   //===------------------------------------------------===//
7684 
7685   // 2. Copy and widen instructions from the old loop into the new loop.
7686   BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7687                              ILV.getOrCreateVectorTripCount(nullptr),
7688                              CanonicalIVStartValue, State,
7689                              IsEpilogueVectorization);
7690 
7691   BestVPlan.execute(&State);
7692 
  // Remember the original loop's metadata so the vectorizer-specific hints can
  // be replaced and follow-up metadata attached below.
7695   MDNode *OrigLoopID = OrigLoop->getLoopID();
7696 
7697   std::optional<MDNode *> VectorizedLoopID =
7698       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7699                                       LLVMLoopVectorizeFollowupVectorized});
7700 
7701   VPBasicBlock *HeaderVPBB =
7702       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7703   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7704   if (VectorizedLoopID)
7705     L->setLoopID(*VectorizedLoopID);
7706   else {
7707     // Keep all loop hints from the original loop on the vector loop (we'll
7708     // replace the vectorizer-specific hints below).
7709     if (MDNode *LID = OrigLoop->getLoopID())
7710       L->setLoopID(LID);
7711 
7712     LoopVectorizeHints Hints(L, true, *ORE);
7713     Hints.setAlreadyVectorized();
7714   }
7715   AddRuntimeUnrollDisableMetaData(L);
7716 
7717   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7718   //    predication, updating analyses.
7719   ILV.fixVectorizedLoop(State, BestVPlan);
7720 
7721   ILV.printDebugTracesAtEnd();
7722 }
7723 
7724 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7725 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7726   for (const auto &Plan : VPlans)
7727     if (PrintVPlansInDotFormat)
7728       Plan->printDOT(O);
7729     else
7730       Plan->print(O);
7731 }
7732 #endif
7733 
7734 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7735 
7736 //===--------------------------------------------------------------------===//
7737 // EpilogueVectorizerMainLoop
7738 //===--------------------------------------------------------------------===//
7739 
7740 /// This function is partially responsible for generating the control flow
7741 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7742 std::pair<BasicBlock *, Value *>
7743 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7744   createVectorLoopSkeleton("");
7745 
7746   // Generate the code to check the minimum iteration count of the vector
7747   // epilogue (see below).
7748   EPI.EpilogueIterationCountCheck =
7749       emitIterationCountCheck(LoopScalarPreHeader, true);
7750   EPI.EpilogueIterationCountCheck->setName("iter.check");
7751 
7752   // Generate the code to check any assumptions that we've made for SCEV
7753   // expressions.
7754   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7755 
7756   // Generate the code that checks at runtime if arrays overlap. We put the
7757   // checks into a separate block to make the more common case of few elements
7758   // faster.
7759   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7760 
7761   // Generate the iteration count check for the main loop, *after* the check
7762   // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer path length for
  // the main loop is compensated for by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
7766   // the epilogue.
7767   EPI.MainLoopIterationCountCheck =
7768       emitIterationCountCheck(LoopScalarPreHeader, false);
7769 
7770   // Generate the induction variable.
7771   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7772 
7773   // Skip induction resume value creation here because they will be created in
7774   // the second pass for the scalar loop. The induction resume values for the
7775   // inductions in the epilogue loop are created before executing the plan for
7776   // the epilogue loop.
7777 
7778   return {completeLoopSkeleton(), nullptr};
7779 }
7780 
7781 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7782   LLVM_DEBUG({
7783     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7784            << "Main Loop VF:" << EPI.MainLoopVF
7785            << ", Main Loop UF:" << EPI.MainLoopUF
7786            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7787            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7788   });
7789 }
7790 
7791 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7792   DEBUG_WITH_TYPE(VerboseDebug, {
7793     dbgs() << "intermediate fn:\n"
7794            << *OrigLoop->getHeader()->getParent() << "\n";
7795   });
7796 }
7797 
7798 BasicBlock *
7799 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7800                                                     bool ForEpilogue) {
7801   assert(Bypass && "Expected valid bypass basic block.");
7802   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7803   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7804   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
7805   // Reuse existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
7807   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7808   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7809 
7810   // Generate code to check if the loop's trip count is less than VF * UF of the
7811   // main vector loop.
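  // For example, with VF=8 and UF=2 the vector loop is bypassed when the trip
  // count is less than 16, or less than or equal to 16 when a scalar epilogue
  // iteration must be left to run.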
7812   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
7813       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7814 
7815   Value *CheckMinIters = Builder.CreateICmp(
7816       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7817       "min.iters.check");
7818 
7819   if (!ForEpilogue)
7820     TCCheckBlock->setName("vector.main.loop.iter.check");
7821 
7822   // Create new preheader for vector loop.
7823   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7824                                    DT, LI, nullptr, "vector.ph");
7825 
7826   if (ForEpilogue) {
7827     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7828                                  DT->getNode(Bypass)->getIDom()) &&
7829            "TC check is expected to dominate Bypass");
7830 
7831     // Update dominator for Bypass & LoopExit.
7832     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7833     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7834       // For loops with multiple exits, there's no edge from the middle block
7835       // to exit blocks (as the epilogue must run) and thus no need to update
7836       // the immediate dominator of the exit blocks.
7837       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7838 
7839     LoopBypassBlocks.push_back(TCCheckBlock);
7840 
7841     // Save the trip count so we don't have to regenerate it in the
7842     // vec.epilog.iter.check. This is safe to do because the trip count
7843     // generated here dominates the vector epilog iter check.
7844     EPI.TripCount = Count;
7845   }
7846 
7847   ReplaceInstWithInst(
7848       TCCheckBlock->getTerminator(),
7849       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7850 
7851   return TCCheckBlock;
7852 }
7853 
7854 //===--------------------------------------------------------------------===//
7855 // EpilogueVectorizerEpilogueLoop
7856 //===--------------------------------------------------------------------===//
7857 
7858 /// This function is partially responsible for generating the control flow
7859 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7860 std::pair<BasicBlock *, Value *>
7861 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7862   createVectorLoopSkeleton("vec.epilog.");
7863 
  // Now, compare the remaining count and, if there aren't enough iterations to
  // execute the vectorized epilogue, skip to the scalar part.
7866   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7867   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7868   LoopVectorPreHeader =
7869       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7870                  LI, nullptr, "vec.epilog.ph");
7871   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7872                                           VecEpilogueIterationCountCheck);
7873 
7874   // Adjust the control flow taking the state info from the main loop
7875   // vectorization into account.
7876   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7877          "expected this to be saved from the previous pass.");
7878   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7879       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7880 
7881   DT->changeImmediateDominator(LoopVectorPreHeader,
7882                                EPI.MainLoopIterationCountCheck);
7883 
7884   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7885       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7886 
7887   if (EPI.SCEVSafetyCheck)
7888     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7889         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7890   if (EPI.MemSafetyCheck)
7891     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7892         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7893 
7894   DT->changeImmediateDominator(
7895       VecEpilogueIterationCountCheck,
7896       VecEpilogueIterationCountCheck->getSinglePredecessor());
7897 
7898   DT->changeImmediateDominator(LoopScalarPreHeader,
7899                                EPI.EpilogueIterationCountCheck);
7900   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7901     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
7903     // dominator of the exit blocks.
7904     DT->changeImmediateDominator(LoopExitBlock,
7905                                  EPI.EpilogueIterationCountCheck);
7906 
7907   // Keep track of bypass blocks, as they feed start values to the induction and
7908   // reduction phis in the scalar loop preheader.
7909   if (EPI.SCEVSafetyCheck)
7910     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7911   if (EPI.MemSafetyCheck)
7912     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7913   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7914 
7915   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7916   // reductions which merge control-flow from the latch block and the middle
7917   // block. Update the incoming values here and move the Phi into the preheader.
7918   SmallVector<PHINode *, 4> PhisInBlock;
7919   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7920     PhisInBlock.push_back(&Phi);
7921 
7922   for (PHINode *Phi : PhisInBlock) {
7923     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7924     Phi->replaceIncomingBlockWith(
7925         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7926         VecEpilogueIterationCountCheck);
7927 
7928     // If the phi doesn't have an incoming value from the
7929     // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7930     // value and also those from other check blocks. This is needed for
7931     // reduction phis only.
7932     if (none_of(Phi->blocks(), [&](BasicBlock *IncB) {
7933           return EPI.EpilogueIterationCountCheck == IncB;
7934         }))
7935       continue;
7936     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7937     if (EPI.SCEVSafetyCheck)
7938       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7939     if (EPI.MemSafetyCheck)
7940       Phi->removeIncomingValue(EPI.MemSafetyCheck);
7941   }
7942 
7943   // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
7945   Type *IdxTy = Legal->getWidestInductionType();
7946   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7947                                          LoopVectorPreHeader->getFirstNonPHI());
7948   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7949   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7950                            EPI.MainLoopIterationCountCheck);
7951 
7952   // Generate induction resume values. These variables save the new starting
7953   // indexes for the scalar loop. They are used to test if there are any tail
7954   // iterations left once the vector loop has completed.
7955   // Note that when the vectorized epilogue is skipped due to iteration count
7956   // check, then the resume value for the induction variable comes from
7957   // the trip count of the main vector loop, hence passing the AdditionalBypass
7958   // argument.
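  // For example, if the main vector loop covers 96 of 100 iterations, the
  // epilogue's induction resumes at 96; if the epilogue vector loop is skipped
  // by its iteration count check, the scalar loop resumes at 96 via this
  // additional bypass value instead.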
7959   createInductionResumeValues({VecEpilogueIterationCountCheck,
7960                                EPI.VectorTripCount} /* AdditionalBypass */);
7961 
7962   return {completeLoopSkeleton(), EPResumeVal};
7963 }
7964 
7965 BasicBlock *
7966 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7967     BasicBlock *Bypass, BasicBlock *Insert) {
7968 
7969   assert(EPI.TripCount &&
7970          "Expected trip count to have been safed in the first pass.");
7971   assert(
7972       (!isa<Instruction>(EPI.TripCount) ||
7973        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7974       "saved trip count does not dominate insertion point.");
7975   Value *TC = EPI.TripCount;
7976   IRBuilder<> Builder(Insert->getTerminator());
7977   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7978 
7979   // Generate code to check if the loop's trip count is less than VF * UF of the
7980   // vector epilogue loop.
7981   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7982       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7983 
7984   Value *CheckMinIters =
7985       Builder.CreateICmp(P, Count,
7986                          createStepForVF(Builder, Count->getType(),
7987                                          EPI.EpilogueVF, EPI.EpilogueUF),
7988                          "min.epilog.iters.check");
7989 
7990   ReplaceInstWithInst(
7991       Insert->getTerminator(),
7992       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7993 
7994   LoopBypassBlocks.push_back(Insert);
7995   return Insert;
7996 }
7997 
7998 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7999   LLVM_DEBUG({
8000     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8001            << "Epilogue Loop VF:" << EPI.EpilogueVF
8002            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8003   });
8004 }
8005 
8006 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8007   DEBUG_WITH_TYPE(VerboseDebug, {
8008     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8009   });
8010 }
8011 
8012 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8013     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
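  // Clamp Range to the longest prefix on which Predicate keeps the value it
  // has at Range.Start. For example, with Range = [4, 32) and a predicate that
  // holds only for VF <= 8, Range is clamped to [4, 16) and true is returned.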
8014   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8015   bool PredicateAtRangeStart = Predicate(Range.Start);
8016 
8017   for (ElementCount TmpVF = Range.Start * 2;
8018        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8019     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8020       Range.End = TmpVF;
8021       break;
8022     }
8023 
8024   return PredicateAtRangeStart;
8025 }
8026 
8027 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8028 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8029 /// of VF's starting at a given VF and extending it as much as possible. Each
8030 /// vectorization decision can potentially shorten this sub-range during
8031 /// buildVPlan().
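/// For example, if MinVF is 2 and MaxVF is 16 and the first buildVPlan call
/// clamps its sub-range to [2, 8), a second VPlan is built for the remaining
/// sub-range starting at VF 8.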
8032 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8033                                            ElementCount MaxVF) {
8034   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8035   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8036     VFRange SubRange = {VF, MaxVFPlusOne};
8037     VPlans.push_back(buildVPlan(SubRange));
8038     VF = SubRange.End;
8039   }
8040 }
8041 
8042 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8043                                          VPlanPtr &Plan) {
8044   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8045 
8046   // Look for cached value.
8047   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8048   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8049   if (ECEntryIt != EdgeMaskCache.end())
8050     return ECEntryIt->second;
8051 
8052   VPValue *SrcMask = createBlockInMask(Src, Plan);
8053 
8054   // The terminator has to be a branch inst!
8055   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8056   assert(BI && "Unexpected terminator found");
8057 
8058   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8059     return EdgeMaskCache[Edge] = SrcMask;
8060 
8061   // If source is an exiting block, we know the exit edge is dynamically dead
8062   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8063   // adding uses of an otherwise potentially dead instruction.
8064   if (OrigLoop->isLoopExiting(Src))
8065     return EdgeMaskCache[Edge] = SrcMask;
8066 
8067   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8068   assert(EdgeMask && "No Edge Mask found for condition");
8069 
8070   if (BI->getSuccessor(0) != Dst)
8071     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8072 
8073   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8074     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8075     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
    // The select version does not introduce new UB if SrcMask is false and
    // EdgeMask is poison. Using 'and' here would introduce undefined behavior.
8078     VPValue *False = Plan->getOrAddVPValue(
8079         ConstantInt::getFalse(BI->getCondition()->getType()));
8080     EdgeMask =
8081         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8082   }
8083 
8084   return EdgeMaskCache[Edge] = EdgeMask;
8085 }
8086 
8087 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8088   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8089 
8090   // Look for cached value.
8091   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8092   if (BCEntryIt != BlockMaskCache.end())
8093     return BCEntryIt->second;
8094 
8095   // All-one mask is modelled as no-mask following the convention for masked
8096   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8097   VPValue *BlockMask = nullptr;
8098 
8099   if (OrigLoop->getHeader() == BB) {
8100     if (!CM.blockNeedsPredicationForAnyReason(BB))
8101       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8102 
8103     assert(CM.foldTailByMasking() && "must fold the tail");
8104 
8105     // If we're using the active lane mask for control flow, then we get the
8106     // mask from the active lane mask PHI that is cached in the VPlan.
8107     PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask();
8108     if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow)
8109       return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi();
8110 
8111     // Introduce the early-exit compare IV <= BTC to form header block mask.
8112     // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8113     // constructing the desired canonical IV in the header block as its first
8114     // non-phi instructions.
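    // For example, with a trip count of 10 and VF 4, BTC is 9 and the final
    // vector iteration compares IV values {8,9,10,11} against 9, producing the
    // mask <1,1,0,0>.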
8115 
8116     VPBasicBlock *HeaderVPBB =
8117         Plan->getVectorLoopRegion()->getEntryBasicBlock();
8118     auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8119     auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8120     HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8121 
8122     VPBuilder::InsertPointGuard Guard(Builder);
8123     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8124     if (EmitGetActiveLaneMask != PredicationStyle::None) {
8125       VPValue *TC = Plan->getOrCreateTripCount();
8126       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
8127                                        nullptr, "active.lane.mask");
8128     } else {
8129       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8130       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8131     }
8132     return BlockMaskCache[BB] = BlockMask;
8133   }
8134 
8135   // This is the block mask. We OR all incoming edges.
8136   for (auto *Predecessor : predecessors(BB)) {
8137     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8138     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8139       return BlockMaskCache[BB] = EdgeMask;
8140 
8141     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8142       BlockMask = EdgeMask;
8143       continue;
8144     }
8145 
8146     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8147   }
8148 
8149   return BlockMaskCache[BB] = BlockMask;
8150 }
8151 
8152 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8153                                                 ArrayRef<VPValue *> Operands,
8154                                                 VFRange &Range,
8155                                                 VPlanPtr &Plan) {
8156   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8157          "Must be called with either a load or store");
8158 
8159   auto willWiden = [&](ElementCount VF) -> bool {
8160     LoopVectorizationCostModel::InstWidening Decision =
8161         CM.getWideningDecision(I, VF);
8162     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8163            "CM decision should be taken at this point.");
8164     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8165       return true;
8166     if (CM.isScalarAfterVectorization(I, VF) ||
8167         CM.isProfitableToScalarize(I, VF))
8168       return false;
8169     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8170   };
8171 
8172   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8173     return nullptr;
8174 
8175   VPValue *Mask = nullptr;
8176   if (Legal->isMaskRequired(I))
8177     Mask = createBlockInMask(I->getParent(), Plan);
8178 
8179   // Determine if the pointer operand of the access is either consecutive or
8180   // reverse consecutive.
8181   LoopVectorizationCostModel::InstWidening Decision =
8182       CM.getWideningDecision(I, Range.Start);
8183   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8184   bool Consecutive =
8185       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8186 
8187   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8188     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8189                                               Consecutive, Reverse);
8190 
8191   StoreInst *Store = cast<StoreInst>(I);
8192   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8193                                             Mask, Consecutive, Reverse);
8194 }
8195 
/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8197 /// insert a recipe to expand the step for the induction recipe.
8198 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
8199     PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
8200     const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
8201     VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
8202   // Returns true if an instruction \p I should be scalarized instead of
8203   // vectorized for the chosen vectorization factor.
8204   auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8205     return CM.isScalarAfterVectorization(I, VF) ||
8206            CM.isProfitableToScalarize(I, VF);
8207   };
8208 
8209   bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8210       [&](ElementCount VF) {
8211         return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8212       },
8213       Range);
8214   assert(IndDesc.getStartValue() ==
8215          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8216   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8217          "step must be loop invariant");
8218 
8219   VPValue *Step =
8220       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8221   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8222     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
8223                                              !NeedsScalarIVOnly);
8224   }
8225   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8226   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
8227                                            !NeedsScalarIVOnly);
8228 }
8229 
8230 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8231     PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8232 
8233   // Check if this is an integer or fp induction. If so, build the recipe that
8234   // produces its scalar and vector values.
8235   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8236     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan,
8237                                        *PSE.getSE(), *OrigLoop, Range);
8238 
8239   // Check if this is pointer induction. If so, build the recipe for it.
8240   if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8241     VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, II->getStep(),
8242                                                            *PSE.getSE());
8243     assert(isa<SCEVConstant>(II->getStep()));
8244     return new VPWidenPointerInductionRecipe(
8245         Phi, Operands[0], Step, *II,
8246         LoopVectorizationPlanner::getDecisionAndClampRange(
8247             [&](ElementCount VF) {
8248               return CM.isScalarAfterVectorization(Phi, VF);
8249             },
8250             Range));
8251   }
8252   return nullptr;
8253 }
8254 
8255 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8256     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8257   // Optimize the special case where the source is a constant integer
8258   // induction variable. Notice that we can only optimize the 'trunc' case
8259   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8260   // (c) other casts depend on pointer size.
8261 
8262   // Determine whether \p K is a truncation based on an induction variable that
8263   // can be optimized.
8264   auto isOptimizableIVTruncate =
8265       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8266     return [=](ElementCount VF) -> bool {
8267       return CM.isOptimizableIVTruncate(K, VF);
8268     };
8269   };
8270 
8271   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8272           isOptimizableIVTruncate(I), Range)) {
8273 
8274     auto *Phi = cast<PHINode>(I->getOperand(0));
8275     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8276     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8277     return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan,
8278                                        *PSE.getSE(), *OrigLoop, Range);
8279   }
8280   return nullptr;
8281 }
8282 
8283 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8284                                                 ArrayRef<VPValue *> Operands,
8285                                                 VPlanPtr &Plan) {
8286   // If all incoming values are equal, the incoming VPValue can be used directly
8287   // instead of creating a new VPBlendRecipe.
8288   if (llvm::all_equal(Operands))
8289     return Operands[0];
8290 
8291   unsigned NumIncoming = Phi->getNumIncomingValues();
8292   // For in-loop reductions, we do not need to create an additional select.
8293   VPValue *InLoopVal = nullptr;
8294   for (unsigned In = 0; In < NumIncoming; In++) {
8295     PHINode *PhiOp =
8296         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8297     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8298       assert(!InLoopVal && "Found more than one in-loop reduction!");
8299       InLoopVal = Operands[In];
8300     }
8301   }
8302 
8303   assert((!InLoopVal || NumIncoming == 2) &&
8304          "Found an in-loop reduction for PHI with unexpected number of "
8305          "incoming values");
8306   if (InLoopVal)
8307     return Operands[Operands[0] == InLoopVal ? 1 : 0];
8308 
8309   // We know that all PHIs in non-header blocks are converted into selects, so
8310   // we don't have to worry about the insertion order and we can just use the
8311   // builder. At this point we generate the predication tree. There may be
8312   // duplications since this is a simple recursive scan, but future
8313   // optimizations will clean it up.
8314   SmallVector<VPValue *, 2> OperandsWithMask;
8315 
8316   for (unsigned In = 0; In < NumIncoming; In++) {
8317     VPValue *EdgeMask =
8318       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8319     assert((EdgeMask || NumIncoming == 1) &&
8320            "Multiple predecessors with one having a full mask");
8321     OperandsWithMask.push_back(Operands[In]);
8322     if (EdgeMask)
8323       OperandsWithMask.push_back(EdgeMask);
8324   }
8325   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8326 }
8327 
8328 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8329                                                    ArrayRef<VPValue *> Operands,
8330                                                    VFRange &Range) const {
8331 
8332   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8333       [this, CI](ElementCount VF) {
8334         return CM.isScalarWithPredication(CI, VF);
8335       },
8336       Range);
8337 
8338   if (IsPredicated)
8339     return nullptr;
8340 
8341   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8342   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8343              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8344              ID == Intrinsic::pseudoprobe ||
8345              ID == Intrinsic::experimental_noalias_scope_decl))
8346     return nullptr;
8347 
8348   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8349 
8350   // Is it beneficial to perform intrinsic call compared to lib call?
8351   bool ShouldUseVectorIntrinsic =
8352       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8353                 [&](ElementCount VF) -> bool {
8354                   bool NeedToScalarize = false;
8355                   // Is it beneficial to perform intrinsic call compared to lib
8356                   // call?
8357                   InstructionCost CallCost =
8358                       CM.getVectorCallCost(CI, VF, NeedToScalarize);
8359                   InstructionCost IntrinsicCost =
8360                       CM.getVectorIntrinsicCost(CI, VF);
8361                   return IntrinsicCost <= CallCost;
8362                 },
8363                 Range);
8364   if (ShouldUseVectorIntrinsic)
8365     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
8366 
  // Is it better to call a vectorized version of the function than to
  // scalarize the call?
8369   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8370       [&](ElementCount VF) -> bool {
8371         // The following case may be scalarized depending on the VF.
        // The flag shows whether we can use a usual Call for the vectorized
        // version of the instruction.
8374         bool NeedToScalarize = false;
8375         CM.getVectorCallCost(CI, VF, NeedToScalarize);
8376         return !NeedToScalarize;
8377       },
8378       Range);
8379   if (ShouldUseVectorCall)
8380     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
8381                                  Intrinsic::not_intrinsic);
8382 
8383   return nullptr;
8384 }
8385 
8386 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8387   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8388          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8389   // Instruction should be widened, unless it is scalar after vectorization,
8390   // scalarization is profitable or it is predicated.
8391   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8392     return CM.isScalarAfterVectorization(I, VF) ||
8393            CM.isProfitableToScalarize(I, VF) ||
8394            CM.isScalarWithPredication(I, VF);
8395   };
8396   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8397                                                              Range);
8398 }
8399 
8400 VPRecipeBase *VPRecipeBuilder::tryToWiden(Instruction *I,
8401                                           ArrayRef<VPValue *> Operands,
8402                                           VPBasicBlock *VPBB, VPlanPtr &Plan) {
8403   switch (I->getOpcode()) {
8404   default:
8405     return nullptr;
8406   case Instruction::SDiv:
8407   case Instruction::UDiv:
8408   case Instruction::SRem:
8409   case Instruction::URem: {
    // If not provably safe, use a select to form a safe divisor before
    // widening the div/rem operation itself.  Otherwise fall through to
    // general handling below.
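    // For example (illustrative pseudo-IR, not taken from the source), a
    // conditional division such as
    //   if (c)  x = a / b;
    // is widened by first forming a safe divisor:
    //   %safe.b = select <VF x i1> %mask, <VF x iN> %b, <VF x iN> <1, ..., 1>
    //   %div    = udiv   <VF x iN> %a, %safe.b
    // Lanes where the mask is false divide by 1 and therefore cannot trap;
    // their results are never used because of the mask.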
8412     if (CM.isPredicatedInst(I)) {
8413       SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8414       VPValue *Mask = createBlockInMask(I->getParent(), Plan);
8415       VPValue *One =
8416         Plan->getOrAddExternalDef(ConstantInt::get(I->getType(), 1u, false));
8417       auto *SafeRHS =
8418          new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8419                            I->getDebugLoc());
8420       VPBB->appendRecipe(SafeRHS);
8421       Ops[1] = SafeRHS;
8422       return new VPWidenRecipe(*I, make_range(Ops.begin(), Ops.end()));
8423     }
8424     LLVM_FALLTHROUGH;
8425   }
8426   case Instruction::Add:
8427   case Instruction::And:
8428   case Instruction::AShr:
8429   case Instruction::BitCast:
8430   case Instruction::FAdd:
8431   case Instruction::FCmp:
8432   case Instruction::FDiv:
8433   case Instruction::FMul:
8434   case Instruction::FNeg:
8435   case Instruction::FPExt:
8436   case Instruction::FPToSI:
8437   case Instruction::FPToUI:
8438   case Instruction::FPTrunc:
8439   case Instruction::FRem:
8440   case Instruction::FSub:
8441   case Instruction::ICmp:
8442   case Instruction::IntToPtr:
8443   case Instruction::LShr:
8444   case Instruction::Mul:
8445   case Instruction::Or:
8446   case Instruction::PtrToInt:
8447   case Instruction::Select:
8448   case Instruction::SExt:
8449   case Instruction::Shl:
8450   case Instruction::SIToFP:
8451   case Instruction::Sub:
8452   case Instruction::Trunc:
8453   case Instruction::UIToFP:
8454   case Instruction::Xor:
8455   case Instruction::ZExt:
8456   case Instruction::Freeze:
8457     return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
  }
8459 }
8460 
8461 void VPRecipeBuilder::fixHeaderPhis() {
8462   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8463   for (VPHeaderPHIRecipe *R : PhisToFix) {
8464     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8465     VPRecipeBase *IncR =
8466         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8467     R->addOperand(IncR->getVPSingleValue());
8468   }
8469 }
8470 
8471 VPBasicBlock *VPRecipeBuilder::handleReplication(
8472     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8473     VPlanPtr &Plan) {
8474   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8475       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8476       Range);
8477 
8478   bool IsPredicated = CM.isPredicatedInst(I);
8479 
8480   // Even if the instruction is not marked as uniform, there are certain
8481   // intrinsic calls that can be effectively treated as such, so we check for
8482   // them here. Conservatively, we only do this for scalable vectors, since
8483   // for fixed-width VFs we can always fall back on full scalarization.
8484   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8485     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8486     case Intrinsic::assume:
8487     case Intrinsic::lifetime_start:
8488     case Intrinsic::lifetime_end:
8489       // For scalable vectors if one of the operands is variant then we still
8490       // want to mark as uniform, which will generate one instruction for just
8491       // the first lane of the vector. We can't scalarize the call in the same
8492       // way as for fixed-width vectors because we don't know how many lanes
8493       // there are.
8494       //
8495       // The reasons for doing it this way for scalable vectors are:
8496       //   1. For the assume intrinsic generating the instruction for the first
      //      lane is still better than not generating any at all. For
8498       //      example, the input may be a splat across all lanes.
8499       //   2. For the lifetime start/end intrinsics the pointer operand only
8500       //      does anything useful when the input comes from a stack object,
8501       //      which suggests it should always be uniform. For non-stack objects
8502       //      the effect is to poison the object, which still allows us to
8503       //      remove the call.
8504       IsUniform = true;
8505       break;
8506     default:
8507       break;
8508     }
8509   }
8510 
8511   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8512                                        IsUniform, IsPredicated);
8513 
8514   // Find if I uses a predicated instruction. If so, it will use its scalar
8515   // value. Avoid hoisting the insert-element which packs the scalar value into
8516   // a vector value, as that happens iff all users use the vector value.
8517   for (VPValue *Op : Recipe->operands()) {
8518     auto *PredR =
8519         dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDefiningRecipe());
8520     if (!PredR)
8521       continue;
8522     auto *RepR = cast<VPReplicateRecipe>(
8523         PredR->getOperand(0)->getDefiningRecipe());
8524     assert(RepR->isPredicated() &&
8525            "expected Replicate recipe to be predicated");
8526     RepR->setAlsoPack(false);
8527   }
8528 
  // Finalize the recipe for Instr; handle the non-predicated case first.
8530   if (!IsPredicated) {
8531     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8532     setRecipe(I, Recipe);
8533     Plan->addVPValue(I, Recipe);
8534     VPBB->appendRecipe(Recipe);
8535     return VPBB;
8536   }
8537   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8538 
8539   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8540   assert(SingleSucc && "VPBB must have a single successor when handling "
8541                        "predicated replication.");
8542   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8543   // Record predicated instructions for above packing optimizations.
8544   VPBlockBase *Region = createReplicateRegion(Recipe, Plan);
8545   VPBlockUtils::insertBlockAfter(Region, VPBB);
8546   auto *RegSucc = new VPBasicBlock();
8547   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8548   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8549   return RegSucc;
8550 }
8551 
8552 VPRegionBlock *
8553 VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe,
8554                                        VPlanPtr &Plan) {
8555   Instruction *Instr = PredRecipe->getUnderlyingInstr();
8556   // Instructions marked for predication are replicated and placed under an
8557   // if-then construct to prevent side-effects.
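  // Roughly, the region created below is a triangle (block names are
  // illustrative, derived from the instruction's opcode):
  //
  //   pred.<op>.entry:     BranchOnMask %mask
  //     |          \
  //     |   pred.<op>.if:  the predicated replicate recipe
  //     |          /
  //   pred.<op>.continue:  VPPredInstPHI merging the predicated result
  //                        (omitted for void-typed instructions)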
8558   // Generate recipes to compute the block mask for this region.
8559   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8560 
8561   // Build the triangular if-then region.
8562   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8563   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8564   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8565   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8566   auto *PHIRecipe = Instr->getType()->isVoidTy()
8567                         ? nullptr
8568                         : new VPPredInstPHIRecipe(PredRecipe);
8569   if (PHIRecipe) {
8570     setRecipe(Instr, PHIRecipe);
8571     Plan->addVPValue(Instr, PHIRecipe);
8572   } else {
8573     setRecipe(Instr, PredRecipe);
8574     Plan->addVPValue(Instr, PredRecipe);
8575   }
8576 
8577   auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8578   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8579   VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
8580 
8581   // Note: first set Entry as region entry and then connect successors starting
8582   // from it in order, to propagate the "parent" of each VPBasicBlock.
8583   VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
8584   VPBlockUtils::connectBlocks(Pred, Exiting);
8585 
8586   return Region;
8587 }
8588 
8589 VPRecipeOrVPValueTy
8590 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8591                                         ArrayRef<VPValue *> Operands,
8592                                         VFRange &Range, VPBasicBlock *VPBB,
8593                                         VPlanPtr &Plan) {
8594   // First, check for specific widening recipes that deal with inductions, Phi
8595   // nodes, calls and memory operations.
8596   VPRecipeBase *Recipe;
8597   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8598     if (Phi->getParent() != OrigLoop->getHeader())
8599       return tryToBlend(Phi, Operands, Plan);
8600 
8601     // Always record recipes for header phis. Later first-order recurrence phis
8602     // can have earlier phis as incoming values.
8603     recordRecipeOf(Phi);
8604 
8605     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8606       return toVPRecipeResult(Recipe);
8607 
8608     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8609     assert((Legal->isReductionVariable(Phi) ||
8610             Legal->isFixedOrderRecurrence(Phi)) &&
8611            "can only widen reductions and fixed-order recurrences here");
8612     VPValue *StartV = Operands[0];
8613     if (Legal->isReductionVariable(Phi)) {
8614       const RecurrenceDescriptor &RdxDesc =
8615           Legal->getReductionVars().find(Phi)->second;
8616       assert(RdxDesc.getRecurrenceStartValue() ==
8617              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8618       PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8619                                            CM.isInLoopReduction(Phi),
8620                                            CM.useOrderedReductions(RdxDesc));
8621     } else {
8622       // TODO: Currently fixed-order recurrences are modeled as chains of
8623       // first-order recurrences. If there are no users of the intermediate
8624       // recurrences in the chain, the fixed order recurrence should be modeled
8625       // directly, enabling more efficient codegen.
8626       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8627     }
8628 
8629     // Record the incoming value from the backedge, so we can add the incoming
8630     // value from the backedge after all recipes have been created.
8631     auto *Inc = cast<Instruction>(
8632         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
8633     auto RecipeIter = Ingredient2Recipe.find(Inc);
8634     if (RecipeIter == Ingredient2Recipe.end())
8635       recordRecipeOf(Inc);
8636 
8637     PhisToFix.push_back(PhiRecipe);
8638     return toVPRecipeResult(PhiRecipe);
8639   }
8640 
8641   if (isa<TruncInst>(Instr) &&
8642       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8643                                                Range, *Plan)))
8644     return toVPRecipeResult(Recipe);
8645 
8646   // All widen recipes below deal only with VF > 1.
8647   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8648           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8649     return nullptr;
8650 
8651   if (auto *CI = dyn_cast<CallInst>(Instr))
8652     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8653 
8654   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8655     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8656 
8657   if (!shouldWiden(Instr, Range))
8658     return nullptr;
8659 
8660   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8661     return toVPRecipeResult(new VPWidenGEPRecipe(
8662         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8663 
8664   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8665     bool InvariantCond =
8666         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8667     return toVPRecipeResult(new VPWidenSelectRecipe(
8668         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8669   }
8670 
8671   return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan));
8672 }
8673 
8674 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8675                                                         ElementCount MaxVF) {
8676   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8677 
8678   // Add assume instructions we need to drop to DeadInstructions, to prevent
8679   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
8681   // control flow is preserved, we should keep them.
8682   SmallPtrSet<Instruction *, 4> DeadInstructions;
8683   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8684   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8685 
8686   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8687   // Dead instructions do not need sinking. Remove them from SinkAfter.
8688   for (Instruction *I : DeadInstructions)
8689     SinkAfter.erase(I);
8690 
8691   // Cannot sink instructions after dead instructions (there won't be any
8692   // recipes for them). Instead, find the first non-dead previous instruction.
8693   for (auto &P : Legal->getSinkAfter()) {
8694     Instruction *SinkTarget = P.second;
8695     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8696     (void)FirstInst;
8697     while (DeadInstructions.contains(SinkTarget)) {
8698       assert(
8699           SinkTarget != FirstInst &&
8700           "Must find a live instruction (at least the one feeding the "
8701           "fixed-order recurrence PHI) before reaching beginning of the block");
8702       SinkTarget = SinkTarget->getPrevNode();
8703       assert(SinkTarget != P.first &&
8704              "sink source equals target, no sinking required");
8705     }
8706     P.second = SinkTarget;
8707   }
8708 
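  // Build one VPlan per sub-range of VFs over which all recorded decisions
  // stay the same; buildVPlanWithVPRecipes clamps SubRange.End accordingly.
  // For example (illustrative), with MinVF = 2 and MaxVF = 16, this may
  // produce one plan covering VFs {2, 4} and a second plan covering {8, 16}
  // if some widening decision changes at VF = 8.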
8709   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8710   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8711     VFRange SubRange = {VF, MaxVFPlusOne};
8712     VPlans.push_back(
8713         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8714     VF = SubRange.End;
8715   }
8716 }
8717 
8718 // Add the necessary canonical IV and branch recipes required to control the
8719 // loop.
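// Without an active lane mask, the recipes added here roughly correspond to
// (illustrative sketch):
//
//   vector.body:
//     %index      = CANONICAL-INDUCTION [ 0, %index.next ]
//     ...
//     %index.next = CanonicalIVIncrement %index        ; index += VF * UF
//     BranchOnCount %index.next, <vector trip count>
//
// With UseLaneMaskForLoopControlFlow, an active-lane-mask phi is added
// instead and the latch branches on the negated mask for the next iteration.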
8720 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8721                                   bool HasNUW,
8722                                   bool UseLaneMaskForLoopControlFlow) {
8723   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8724   auto *StartV = Plan.getOrAddVPValue(StartIdx);
8725 
8726   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8727   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8728   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8729   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8730   Header->insert(CanonicalIVPHI, Header->begin());
8731 
8732   // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
8733   // IV by VF * UF.
8734   auto *CanonicalIVIncrement =
8735       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8736                                : VPInstruction::CanonicalIVIncrement,
8737                         {CanonicalIVPHI}, DL, "index.next");
8738   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8739 
8740   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8741   EB->appendRecipe(CanonicalIVIncrement);
8742 
8743   if (UseLaneMaskForLoopControlFlow) {
8744     // Create the active lane mask instruction in the vplan preheader.
8745     VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
8746 
8747     // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
8748     // we have to take unrolling into account. Each part needs to start at
8749     //   Part * VF
8750     auto *CanonicalIVIncrementParts =
8751         new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
8752                                  : VPInstruction::CanonicalIVIncrementForPart,
8753                           {StartV}, DL, "index.part.next");
8754     Preheader->appendRecipe(CanonicalIVIncrementParts);
8755 
8756     // Create the ActiveLaneMask instruction using the correct start values.
8757     VPValue *TC = Plan.getOrCreateTripCount();
8758     auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
8759                                        {CanonicalIVIncrementParts, TC}, DL,
8760                                        "active.lane.mask.entry");
8761     Preheader->appendRecipe(EntryALM);
8762 
8763     // Now create the ActiveLaneMaskPhi recipe in the main loop using the
8764     // preheader ActiveLaneMask instruction.
8765     auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
8766     Header->insert(LaneMaskPhi, Header->getFirstNonPhi());
8767 
8768     // Create the active lane mask for the next iteration of the loop.
8769     CanonicalIVIncrementParts =
8770         new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
8771                                  : VPInstruction::CanonicalIVIncrementForPart,
8772                           {CanonicalIVIncrement}, DL);
8773     EB->appendRecipe(CanonicalIVIncrementParts);
8774 
8775     auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
8776                                   {CanonicalIVIncrementParts, TC}, DL,
8777                                   "active.lane.mask.next");
8778     EB->appendRecipe(ALM);
8779     LaneMaskPhi->addOperand(ALM);
8780 
8781     // We have to invert the mask here because a true condition means jumping
8782     // to the exit block.
8783     auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
8784     EB->appendRecipe(NotMask);
8785 
8786     VPInstruction *BranchBack =
8787         new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
8788     EB->appendRecipe(BranchBack);
8789   } else {
8790     // Add the BranchOnCount VPInstruction to the latch.
8791     VPInstruction *BranchBack = new VPInstruction(
8792         VPInstruction::BranchOnCount,
8793         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8794     EB->appendRecipe(BranchBack);
8795   }
8796 }
8797 
8798 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8799 // original exit block.
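// For example (illustrative), for an LCSSA phi in the exit block such as
//   %res = phi i32 [ %sum.next, %loop.latch ]
// a VPLiveOut is added mapping %res to the VPValue that models %sum.next, so
// the exit value can be fixed up once the vector loop has been generated.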
8800 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
8801                                 VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
8802                                 VPlan &Plan) {
8803   BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8804   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8805   // Only handle single-exit loops with unique exit blocks for now.
8806   if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8807     return;
8808 
8809   // Introduce VPUsers modeling the exit values.
8810   for (PHINode &ExitPhi : ExitBB->phis()) {
8811     Value *IncomingValue =
8812         ExitPhi.getIncomingValueForBlock(ExitingBB);
8813     VPValue *V = Plan.getOrAddVPValue(IncomingValue, true);
8814     Plan.addLiveOut(&ExitPhi, V);
8815   }
8816 }
8817 
8818 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8819     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8820     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8821 
8822   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8823 
8824   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8825 
8826   // ---------------------------------------------------------------------------
8827   // Pre-construction: record ingredients whose recipes we'll need to further
8828   // process after constructing the initial VPlan.
8829   // ---------------------------------------------------------------------------
8830 
8831   // Mark instructions we'll need to sink later and their targets as
8832   // ingredients whose recipe we'll need to record.
8833   for (const auto &Entry : SinkAfter) {
8834     RecipeBuilder.recordRecipeOf(Entry.first);
8835     RecipeBuilder.recordRecipeOf(Entry.second);
8836   }
8837   for (const auto &Reduction : CM.getInLoopReductionChains()) {
8838     PHINode *Phi = Reduction.first;
8839     RecurKind Kind =
8840         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8841     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8842 
8843     RecipeBuilder.recordRecipeOf(Phi);
8844     for (const auto &R : ReductionOperations) {
8845       RecipeBuilder.recordRecipeOf(R);
8846       // For min/max reductions, where we have a pair of icmp/select, we also
8847       // need to record the ICmp recipe, so it can be removed later.
8848       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8849              "Only min/max recurrences allowed for inloop reductions");
8850       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8851         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8852     }
8853   }
8854 
8855   // For each interleave group which is relevant for this (possibly trimmed)
8856   // Range, add it to the set of groups to be later applied to the VPlan and add
8857   // placeholders for its members' Recipes which we'll be replacing with a
8858   // single VPInterleaveRecipe.
8859   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8860     auto applyIG = [IG, this](ElementCount VF) -> bool {
8861       return (VF.isVector() && // Query is illegal for VF == 1
8862               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8863                   LoopVectorizationCostModel::CM_Interleave);
8864     };
8865     if (!getDecisionAndClampRange(applyIG, Range))
8866       continue;
8867     InterleaveGroups.insert(IG);
8868     for (unsigned i = 0; i < IG->getFactor(); i++)
8869       if (Instruction *Member = IG->getMember(i))
8870         RecipeBuilder.recordRecipeOf(Member);
  }
8872 
8873   // ---------------------------------------------------------------------------
8874   // Build initial VPlan: Scan the body of the loop in a topological order to
8875   // visit each basic block after having visited its predecessor basic blocks.
8876   // ---------------------------------------------------------------------------
8877 
8878   // Create initial VPlan skeleton, starting with a block for the pre-header,
8879   // followed by a region for the vector loop, followed by the middle block. The
8880   // skeleton vector loop region contains a header and latch block.
8881   VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
8882   auto Plan = std::make_unique<VPlan>(Preheader);
8883 
8884   VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8885   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8886   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8887   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8888   VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
8889   VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
8890   VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
8891 
8892   Instruction *DLInst =
8893       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8894   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
8895                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
8896                         !CM.foldTailByMasking(),
8897                         CM.useActiveLaneMaskForControlFlow());
8898 
8899   // Scan the body of the loop in a topological order to visit each basic block
8900   // after having visited its predecessor basic blocks.
8901   LoopBlocksDFS DFS(OrigLoop);
8902   DFS.perform(LI);
8903 
8904   VPBasicBlock *VPBB = HeaderVPBB;
8905   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
8906   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8907     // Relevant instructions from basic block BB will be grouped into VPRecipe
8908     // ingredients and fill a new VPBasicBlock.
8909     unsigned VPBBsForBB = 0;
8910     if (VPBB != HeaderVPBB)
8911       VPBB->setName(BB->getName());
8912     Builder.setInsertPoint(VPBB);
8913 
8914     // Introduce each ingredient into VPlan.
8915     // TODO: Model and preserve debug intrinsics in VPlan.
8916     for (Instruction &I : BB->instructionsWithoutDebug()) {
8917       Instruction *Instr = &I;
8918 
8919       // First filter out irrelevant instructions, to ensure no recipes are
8920       // built for them.
8921       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8922         continue;
8923 
8924       SmallVector<VPValue *, 4> Operands;
8925       auto *Phi = dyn_cast<PHINode>(Instr);
8926       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8927         Operands.push_back(Plan->getOrAddVPValue(
8928             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8929       } else {
8930         auto OpRange = Plan->mapToVPValues(Instr->operands());
8931         Operands = {OpRange.begin(), OpRange.end()};
8932       }
8933 
      // Invariant stores inside the loop will be deleted, and a single store
      // with the final reduction value will be added to the exit block.
      if (auto *SI = dyn_cast<StoreInst>(&I);
          SI && Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
        continue;
8940 
8941       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8942               Instr, Operands, Range, VPBB, Plan)) {
8943         // If Instr can be simplified to an existing VPValue, use it.
8944         if (RecipeOrValue.is<VPValue *>()) {
8945           auto *VPV = RecipeOrValue.get<VPValue *>();
8946           Plan->addVPValue(Instr, VPV);
8947           // If the re-used value is a recipe, register the recipe for the
8948           // instruction, in case the recipe for Instr needs to be recorded.
8949           if (VPRecipeBase *R = VPV->getDefiningRecipe())
8950             RecipeBuilder.setRecipe(Instr, R);
8951           continue;
8952         }
8953         // Otherwise, add the new recipe.
8954         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8955         for (auto *Def : Recipe->definedValues()) {
8956           auto *UV = Def->getUnderlyingValue();
8957           Plan->addVPValue(UV, Def);
8958         }
8959 
8960         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
8961             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
8962           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
8963           // of the header block. That can happen for truncates of induction
8964           // variables. Those recipes are moved to the phi section of the header
8965           // block after applying SinkAfter, which relies on the original
8966           // position of the trunc.
8967           assert(isa<TruncInst>(Instr));
8968           InductionsToMove.push_back(
8969               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
8970         }
8971         RecipeBuilder.setRecipe(Instr, Recipe);
8972         VPBB->appendRecipe(Recipe);
8973         continue;
8974       }
8975 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
8978       VPBasicBlock *NextVPBB =
8979           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8980       if (NextVPBB != VPBB) {
8981         VPBB = NextVPBB;
8982         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8983                                     : "");
8984       }
8985     }
8986 
8987     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8988     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8989   }
8990 
8991   // After here, VPBB should not be used.
8992   VPBB = nullptr;
8993 
8994   addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
8995 
8996   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8997          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8998          "entry block must be set to a VPRegionBlock having a non-empty entry "
8999          "VPBasicBlock");
9000   RecipeBuilder.fixHeaderPhis();
9001 
9002   // ---------------------------------------------------------------------------
9003   // Transform initial VPlan: Apply previously taken decisions, in order, to
9004   // bring the VPlan to its final state.
9005   // ---------------------------------------------------------------------------
9006 
9007   // Apply Sink-After legal constraints.
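  // Each (Sink, Target) pair recorded by legality requires the recipe created
  // for Sink to be emitted after the recipe created for Target. This is what
  // allows users of a fixed-order recurrence phi to be rewired to the
  // FirstOrderRecurrenceSplice introduced further below, which is placed
  // right after the recipe computing the recurrence's next value.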
9008   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
9009     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
9010     if (Region && Region->isReplicator()) {
9011       assert(Region->getNumSuccessors() == 1 &&
9012              Region->getNumPredecessors() == 1 && "Expected SESE region!");
9013       assert(R->getParent()->size() == 1 &&
9014              "A recipe in an original replicator region must be the only "
9015              "recipe in its block");
9016       return Region;
9017     }
9018     return nullptr;
9019   };
9020   for (const auto &Entry : SinkAfter) {
9021     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9022     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9023 
9024     auto *TargetRegion = GetReplicateRegion(Target);
9025     auto *SinkRegion = GetReplicateRegion(Sink);
9026     if (!SinkRegion) {
9027       // If the sink source is not a replicate region, sink the recipe directly.
9028       if (TargetRegion) {
9029         // The target is in a replication region, make sure to move Sink to
9030         // the block after it, not into the replication region itself.
9031         VPBasicBlock *NextBlock =
9032             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9033         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9034       } else
9035         Sink->moveAfter(Target);
9036       continue;
9037     }
9038 
9039     // The sink source is in a replicate region. Unhook the region from the CFG.
9040     auto *SinkPred = SinkRegion->getSinglePredecessor();
9041     auto *SinkSucc = SinkRegion->getSingleSuccessor();
9042     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9043     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9044     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9045 
9046     if (TargetRegion) {
9047       // The target recipe is also in a replicate region, move the sink region
9048       // after the target region.
9049       auto *TargetSucc = TargetRegion->getSingleSuccessor();
9050       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9051       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9052       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9053     } else {
      // The sink source is in a replicate region; we need to move the whole
      // replicate region, which should only contain a single recipe in the
      // main block.
9057       auto *SplitBlock =
9058           Target->getParent()->splitAt(std::next(Target->getIterator()));
9059 
9060       auto *SplitPred = SplitBlock->getSinglePredecessor();
9061 
9062       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9063       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9064       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9065     }
9066   }
9067 
9068   VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
9069   VPlanTransforms::removeRedundantInductionCasts(*Plan);
9070 
9071   // Now that sink-after is done, move induction recipes for optimized truncates
9072   // to the phi section of the header block.
9073   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
9074     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9075 
9076   // Adjust the recipes for any inloop reductions.
9077   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan,
9078                              RecipeBuilder, Range.Start);
9079 
9080   // Introduce a recipe to combine the incoming and previous values of a
9081   // fixed-order recurrence.
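  // Conceptually, for VF = 4, if the recurrence phi holds the previous
  // iteration's vector <p0, p1, p2, p3> and the backedge value holds the
  // current vector <c0, c1, c2, c3>, the splice produces <p3, c0, c1, c2>,
  // i.e. each lane observes the value produced one scalar iteration earlier.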
9082   for (VPRecipeBase &R :
9083        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9084     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9085     if (!RecurPhi)
9086       continue;
9087 
9088     VPRecipeBase *PrevRecipe = &RecurPhi->getBackedgeRecipe();
9089     // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
9090     // to terminate.
9091     while (auto *PrevPhi =
9092                dyn_cast<VPFirstOrderRecurrencePHIRecipe>(PrevRecipe))
9093       PrevRecipe = &PrevPhi->getBackedgeRecipe();
9094     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
9095     auto *Region = GetReplicateRegion(PrevRecipe);
9096     if (Region)
9097       InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor());
9098     if (!InsertBlock) {
9099       InsertBlock = new VPBasicBlock(Region->getName() + ".succ");
9100       VPBlockUtils::insertBlockAfter(InsertBlock, Region);
9101     }
9102     if (Region || PrevRecipe->isPhi())
9103       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
9104     else
9105       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
9106 
9107     auto *RecurSplice = cast<VPInstruction>(
9108         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9109                              {RecurPhi, RecurPhi->getBackedgeValue()}));
9110 
9111     RecurPhi->replaceAllUsesWith(RecurSplice);
9112     // Set the first operand of RecurSplice to RecurPhi again, after replacing
9113     // all users.
9114     RecurSplice->setOperand(0, RecurPhi);
9115   }
9116 
9117   // Interleave memory: for each Interleave Group we marked earlier as relevant
9118   // for this VPlan, replace the Recipes widening its memory instructions with a
9119   // single VPInterleaveRecipe at its insertion point.
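  // For example (illustrative), an interleave group of factor 2 covering
  // loads of A[2*i] and A[2*i+1] has both member recipes replaced by one
  // VPInterleaveRecipe, which later emits a single wide load of 2*VF elements
  // followed by shuffles that de-interleave the two member values.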
9120   for (const auto *IG : InterleaveGroups) {
9121     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9122         RecipeBuilder.getRecipe(IG->getInsertPos()));
9123     SmallVector<VPValue *, 4> StoredValues;
9124     for (unsigned i = 0; i < IG->getFactor(); ++i)
9125       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9126         auto *StoreR =
9127             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9128         StoredValues.push_back(StoreR->getStoredValue());
9129       }
9130 
9131     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9132                                         Recipe->getMask());
9133     VPIG->insertBefore(Recipe);
9134     unsigned J = 0;
9135     for (unsigned i = 0; i < IG->getFactor(); ++i)
9136       if (Instruction *Member = IG->getMember(i)) {
9137         if (!Member->getType()->isVoidTy()) {
9138           VPValue *OriginalV = Plan->getVPValue(Member);
9139           Plan->removeVPValueFor(Member);
9140           Plan->addVPValue(Member, VPIG->getVPValue(J));
9141           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9142           J++;
9143         }
9144         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9145       }
9146   }
9147 
9148   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9149        VF *= 2)
9150     Plan->addVF(VF);
9151   Plan->setName("Initial VPlan");
9152 
  // From this point onwards, VPlan-to-VPlan transformations may change the
  // plan in ways that make accessing values through the original IR values
  // incorrect.
9155   Plan->disableValue2VPValue();
9156 
9157   VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9158   VPlanTransforms::removeDeadRecipes(*Plan);
9159 
9160   bool ShouldSimplify = true;
9161   while (ShouldSimplify) {
9162     ShouldSimplify = VPlanTransforms::sinkScalarOperands(*Plan);
9163     ShouldSimplify |=
9164         VPlanTransforms::mergeReplicateRegionsIntoSuccessors(*Plan);
9165     ShouldSimplify |= VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
9166   }
9167 
9168   VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
9169   VPlanTransforms::mergeBlocksIntoPredecessors(*Plan);
9170 
9171   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9172   return Plan;
9173 }
9174 
9175 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build
  // VPlan upfront in the vectorization pipeline.
9180   assert(!OrigLoop->isInnermost());
9181   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9182 
  // Create a new, empty VPlan.
9184   auto Plan = std::make_unique<VPlan>();
9185 
  // Build the hierarchical CFG.
9187   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9188   HCFGBuilder.buildHierarchicalCFG();
9189 
9190   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9191        VF *= 2)
9192     Plan->addVF(VF);
9193 
9194   SmallPtrSet<Instruction *, 1> DeadInstructions;
9195   VPlanTransforms::VPInstructionsToVPRecipes(
9196       OrigLoop, Plan,
9197       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9198       DeadInstructions, *PSE.getSE(), *TLI);
9199 
9200   // Remove the existing terminator of the exiting block of the top-most region.
9201   // A BranchOnCount will be added instead when adding the canonical IV recipes.
9202   auto *Term =
9203       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9204   Term->eraseFromParent();
9205 
9206   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9207                         true, CM.useActiveLaneMaskForControlFlow());
9208   return Plan;
9209 }
9210 
// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
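// For example (illustrative), an in-loop integer add reduction whose scalar
// form is
//   %sum.next = add i32 %sum, %val
// is rewritten into a VPReductionRecipe that keeps the chain scalar and
// reduces the widened operand each iteration, roughly
//   %sum.next = %sum + reduce.add(<VF x i32> %val.vec)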
9216 void LoopVectorizationPlanner::adjustRecipesForReductions(
9217     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9218     ElementCount MinVF) {
9219   for (const auto &Reduction : CM.getInLoopReductionChains()) {
9220     PHINode *Phi = Reduction.first;
9221     const RecurrenceDescriptor &RdxDesc =
9222         Legal->getReductionVars().find(Phi)->second;
9223     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9224 
9225     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9226       continue;
9227 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
9231     // For minmax the chain will be the select instructions.
9232     Instruction *Chain = Phi;
9233     for (Instruction *R : ReductionOperations) {
9234       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9235       RecurKind Kind = RdxDesc.getRecurrenceKind();
9236 
9237       VPValue *ChainOp = Plan->getVPValue(Chain);
9238       unsigned FirstOpId;
9239       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9240              "Only min/max recurrences allowed for inloop reductions");
9241       // Recognize a call to the llvm.fmuladd intrinsic.
9242       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9243       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9244              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9245       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9246         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9247                "Expected to replace a VPWidenSelectSC");
9248         FirstOpId = 1;
9249       } else {
9250         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9251                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9252                "Expected to replace a VPWidenSC");
9253         FirstOpId = 0;
9254       }
9255       unsigned VecOpId =
9256           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9257       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9258 
9259       VPValue *CondOp = nullptr;
9260       if (CM.blockNeedsPredicationForAnyReason(R->getParent())) {
9261         VPBuilder::InsertPointGuard Guard(Builder);
9262         Builder.setInsertPoint(WidenRecipe->getParent(),
9263                                WidenRecipe->getIterator());
9264         CondOp = RecipeBuilder.createBlockInMask(R->getParent(), Plan);
9265       }
9266 
9267       if (IsFMulAdd) {
9268         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9269         // need to create an fmul recipe to use as the vector operand for the
9270         // fadd reduction.
9271         VPInstruction *FMulRecipe = new VPInstruction(
9272             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9273         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9274         WidenRecipe->getParent()->insert(FMulRecipe,
9275                                          WidenRecipe->getIterator());
9276         VecOp = FMulRecipe;
9277       }
9278       VPReductionRecipe *RedRecipe =
9279           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9280       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9281       Plan->removeVPValueFor(R);
9282       Plan->addVPValue(R, RedRecipe);
      // Append the recipe to the end of the VPBasicBlock because we need to
      // ensure that it comes after all of its inputs, including CondOp.
      WidenRecipe->getParent()->appendRecipe(RedRecipe);
9287       WidenRecipe->eraseFromParent();
9288 
9289       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9290         VPRecipeBase *CompareRecipe =
9291             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9292         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9293                "Expected to replace a VPWidenSC");
9294         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9295                "Expected no remaining users");
9296         CompareRecipe->eraseFromParent();
9297       }
9298       Chain = R;
9299     }
9300   }
9301 
9302   // If tail is folded by masking, introduce selects between the phi
9303   // and the live-out instruction of each reduction, at the beginning of the
9304   // dedicated latch block.
9305   if (CM.foldTailByMasking()) {
9306     Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9307     for (VPRecipeBase &R :
9308          Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9309       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9310       if (!PhiR || PhiR->isInLoop())
9311         continue;
9312       VPValue *Cond =
9313           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9314       VPValue *Red = PhiR->getBackedgeValue();
9315       assert(Red->getDefiningRecipe()->getParent() != LatchVPBB &&
9316              "reduction recipe must be defined before latch");
9317       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9318     }
9319   }
9320 }
9321 
9322 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9323 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9324                                VPSlotTracker &SlotTracker) const {
9325   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9326   IG->getInsertPos()->printAsOperand(O, false);
9327   O << ", ";
9328   getAddr()->printAsOperand(O, SlotTracker);
9329   VPValue *Mask = getMask();
9330   if (Mask) {
9331     O << ", ";
9332     Mask->printAsOperand(O, SlotTracker);
9333   }
9334 
9335   unsigned OpIdx = 0;
9336   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9337     if (!IG->getMember(i))
9338       continue;
9339     if (getNumStoreOperands() > 0) {
9340       O << "\n" << Indent << "  store ";
9341       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9342       O << " to index " << i;
9343     } else {
9344       O << "\n" << Indent << "  ";
9345       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9346       O << " = load from index " << i;
9347     }
9348     ++OpIdx;
9349   }
9350 }
9351 #endif
9352 
9353 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9354   assert(!State.Instance && "Int or FP induction being replicated.");
9355 
9356   Value *Start = getStartValue()->getLiveInIRValue();
9357   const InductionDescriptor &ID = getInductionDescriptor();
9358   TruncInst *Trunc = getTruncInst();
9359   IRBuilderBase &Builder = State.Builder;
9360   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
9361   assert(State.VF.isVector() && "must have vector VF");
9362 
9363   // The value from the original loop to which we are mapping the new induction
9364   // variable.
9365   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
9366 
9367   // Fast-math-flags propagate from the original induction instruction.
9368   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9369   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
9370     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
9371 
9372   // Now do the actual transformations, and start with fetching the step value.
9373   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9374 
9375   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
9376          "Expected either an induction phi-node or a truncate of it!");
9377 
9378   // Construct the initial value of the vector IV in the vector loop preheader
9379   auto CurrIP = Builder.saveIP();
9380   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9381   Builder.SetInsertPoint(VectorPH->getTerminator());
9382   if (isa<TruncInst>(EntryVal)) {
9383     assert(Start->getType()->isIntegerTy() &&
9384            "Truncation requires an integer type");
9385     auto *TruncType = cast<IntegerType>(EntryVal->getType());
9386     Step = Builder.CreateTrunc(Step, TruncType);
9387     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
9388   }
9389 
9390   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
9391   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
9392   Value *SteppedStart = getStepVector(
9393       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
9394 
9395   // We create vector phi nodes for both integer and floating-point induction
9396   // variables. Here, we determine the kind of arithmetic we will perform.
9397   Instruction::BinaryOps AddOp;
9398   Instruction::BinaryOps MulOp;
9399   if (Step->getType()->isIntegerTy()) {
9400     AddOp = Instruction::Add;
9401     MulOp = Instruction::Mul;
9402   } else {
9403     AddOp = ID.getInductionOpcode();
9404     MulOp = Instruction::FMul;
9405   }
9406 
9407   // Multiply the vectorization factor by the step using integer or
9408   // floating-point arithmetic as appropriate.
9409   Type *StepType = Step->getType();
9410   Value *RuntimeVF;
9411   if (Step->getType()->isFloatingPointTy())
9412     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
9413   else
9414     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
9415   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
9416 
9417   // Create a vector splat to use in the induction update.
9418   //
9419   // FIXME: If the step is non-constant, we create the vector splat with
9420   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
9421   //        handle a constant vector splat.
9422   Value *SplatVF = isa<Constant>(Mul)
9423                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
9424                        : Builder.CreateVectorSplat(State.VF, Mul);
9425   Builder.restoreIP(CurrIP);
9426 
9427   // We may need to add the step a number of times, depending on the unroll
9428   // factor. The last of those goes into the PHI.
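  // For example (illustrative), with start 0, step 1, VF = 4 and UF = 2:
  //   vec.ind      (part 0) = <0, 1, 2, 3>
  //   step.add     (part 1) = <4, 5, 6, 7>
  //   vec.ind.next          = <8, 9, 10, 11>   ; feeds the phi's backedge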
9429   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
9430                                     &*State.CFG.PrevBB->getFirstInsertionPt());
9431   VecInd->setDebugLoc(EntryVal->getDebugLoc());
9432   Instruction *LastInduction = VecInd;
9433   for (unsigned Part = 0; Part < State.UF; ++Part) {
9434     State.set(this, LastInduction, Part);
9435 
9436     if (isa<TruncInst>(EntryVal))
9437       State.addMetadata(LastInduction, EntryVal);
9438 
9439     LastInduction = cast<Instruction>(
9440         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
9441     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
9442   }
9443 
9444   LastInduction->setName("vec.ind.next");
9445   VecInd->addIncoming(SteppedStart, VectorPH);
9446   // Add induction update using an incorrect block temporarily. The phi node
9447   // will be fixed after VPlan execution. Note that at this point the latch
9448   // block cannot be used, as it does not exist yet.
9449   // TODO: Model increment value in VPlan, by turning the recipe into a
9450   // multi-def and a subclass of VPHeaderPHIRecipe.
9451   VecInd->addIncoming(LastInduction, VectorPH);
9452 }
9453 
9454 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9455   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9456          "Not a pointer induction according to InductionDescriptor!");
9457   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9458          "Unexpected type.");
9459 
9460   auto *IVR = getParent()->getPlan()->getCanonicalIV();
9461   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9462 
9463   if (onlyScalarsGenerated(State.VF)) {
9464     // This is the normalized GEP that starts counting at zero.
9465     Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9466         CanonicalIV, IndDesc.getStep()->getType());
9467     // Determine the number of scalars we need to generate for each unroll
9468     // iteration. If the instruction is uniform, we only need to generate the
9469     // first lane. Otherwise, we generate all VF values.
9470     bool IsUniform = vputils::onlyFirstLaneUsed(this);
9471     assert((IsUniform || !State.VF.isScalable()) &&
9472            "Cannot scalarize a scalable VF");
9473     unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9474 
9475     for (unsigned Part = 0; Part < State.UF; ++Part) {
9476       Value *PartStart =
9477           createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9478 
9479       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9480         Value *Idx = State.Builder.CreateAdd(
9481             PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9482         Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9483 
9484         Value *Step = State.get(getOperand(1), VPIteration(0, Part));
9485         Value *SclrGep = emitTransformedIndex(
9486             State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
9487         SclrGep->setName("next.gep");
9488         State.set(this, SclrGep, VPIteration(Part, Lane));
9489       }
9490     }
9491     return;
9492   }
9493 
9494   assert(isa<SCEVConstant>(IndDesc.getStep()) &&
9495          "Induction step not a SCEV constant!");
9496   Type *PhiType = IndDesc.getStep()->getType();
9497 
9498   // Build a pointer phi
9499   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9500   Type *ScStValueType = ScalarStartValue->getType();
9501   PHINode *NewPointerPhi =
9502       PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9503 
9504   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9505   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9506 
9507   // A pointer induction, performed by using a gep
9508   Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9509 
9510   Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
9511   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9512   Value *NumUnrolledElems =
9513       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9514   Value *InductionGEP = GetElementPtrInst::Create(
9515       IndDesc.getElementType(), NewPointerPhi,
9516       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9517       InductionLoc);
9518   // Add induction update using an incorrect block temporarily. The phi node
9519   // will be fixed after VPlan execution. Note that at this point the latch
9520   // block cannot be used, as it does not exist yet.
9521   // TODO: Model increment value in VPlan, by turning the recipe into a
9522   // multi-def and a subclass of VPHeaderPHIRecipe.
9523   NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9524 
9525   // Create UF many actual address geps that use the pointer
9526   // phi as base and a vectorized version of the step value
9527   // (<step*0, ..., step*N>) as offset.
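  // For example (illustrative), with an i32 element type, step 1, VF = 4 and
  // UF = 2, part 0 uses offsets <0, 1, 2, 3> and part 1 uses <4, 5, 6, 7>,
  // while pointer.phi itself advances by VF * UF = 8 elements per vector
  // iteration via the ptr.ind GEP created above.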
9528   for (unsigned Part = 0; Part < State.UF; ++Part) {
9529     Type *VecPhiType = VectorType::get(PhiType, State.VF);
9530     Value *StartOffsetScalar =
9531         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9532     Value *StartOffset =
9533         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9534     // Create a vector of consecutive numbers from zero to VF.
9535     StartOffset = State.Builder.CreateAdd(
9536         StartOffset, State.Builder.CreateStepVector(VecPhiType));
9537 
9538     assert(ScalarStepValue == State.get(getOperand(1), VPIteration(0, Part)) &&
9539            "scalar step must be the same across all parts");
9540     Value *GEP = State.Builder.CreateGEP(
9541         IndDesc.getElementType(), NewPointerPhi,
9542         State.Builder.CreateMul(
9543             StartOffset,
9544             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9545             "vector.gep"));
9546     State.set(this, GEP, Part);
9547   }
9548 }
9549 
9550 void VPDerivedIVRecipe::execute(VPTransformState &State) {
9551   assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9552 
9553   // Fast-math-flags propagate from the original induction instruction.
9554   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9555   if (IndDesc.getInductionBinOp() &&
9556       isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9557     State.Builder.setFastMathFlags(
9558         IndDesc.getInductionBinOp()->getFastMathFlags());
9559 
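  // For an integer induction this computes, roughly,
  //   DerivedIV = StartValue + CanonicalIV * Step
  // with the corresponding FP operations used for floating-point inductions.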
9560   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9561   Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9562   Value *DerivedIV =
9563       emitTransformedIndex(State.Builder, CanonicalIV,
9564                            getStartValue()->getLiveInIRValue(), Step, IndDesc);
9565   DerivedIV->setName("offset.idx");
9566   if (ResultTy != DerivedIV->getType()) {
9567     assert(Step->getType()->isIntegerTy() &&
9568            "Truncation requires an integer step");
9569     DerivedIV = State.Builder.CreateTrunc(DerivedIV, ResultTy);
9570   }
9571   assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9572 
9573   State.set(this, DerivedIV, VPIteration(0, 0));
9574 }
9575 
9576 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
9577   // Fast-math-flags propagate from the original induction instruction.
9578   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9579   if (IndDesc.getInductionBinOp() &&
9580       isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9581     State.Builder.setFastMathFlags(
9582         IndDesc.getInductionBinOp()->getFastMathFlags());
9583 
9584   Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
9585   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9586 
9587   buildScalarSteps(BaseIV, Step, IndDesc, this, State);
9588 }
9589 
9590 void VPInterleaveRecipe::execute(VPTransformState &State) {
9591   assert(!State.Instance && "Interleave group being replicated.");
9592   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9593                                       getStoredValues(), getMask());
9594 }
9595 
9596 void VPReductionRecipe::execute(VPTransformState &State) {
9597   assert(!State.Instance && "Reduction being replicated.");
9598   Value *PrevInChain = State.get(getChainOp(), 0);
9599   RecurKind Kind = RdxDesc->getRecurrenceKind();
9600   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9601   // Propagate the fast-math flags carried by the underlying instruction.
9602   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9603   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9604   for (unsigned Part = 0; Part < State.UF; ++Part) {
9605     Value *NewVecOp = State.get(getVecOp(), Part);
9606     if (VPValue *Cond = getCondOp()) {
9607       Value *NewCond = State.get(Cond, Part);
9608       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9609       Value *Iden = RdxDesc->getRecurrenceIdentity(
9610           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9611       Value *IdenVec =
9612           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
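      // For example, for an integer add reduction the identity is 0, so lanes
      // where the condition is false contribute nothing to the reduction.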
9613       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9614       NewVecOp = Select;
9615     }
9616     Value *NewRed;
9617     Value *NextInChain;
9618     if (IsOrdered) {
9619       if (State.VF.isVector())
9620         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9621                                         PrevInChain);
9622       else
9623         NewRed = State.Builder.CreateBinOp(
9624             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9625             NewVecOp);
9626       PrevInChain = NewRed;
9627     } else {
9628       PrevInChain = State.get(getChainOp(), Part);
9629       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9630     }
9631     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9632       NextInChain =
9633           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9634                          NewRed, PrevInChain);
9635     } else if (IsOrdered)
9636       NextInChain = NewRed;
9637     else
9638       NextInChain = State.Builder.CreateBinOp(
9639           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9640           PrevInChain);
9641     State.set(this, NextInChain, Part);
9642   }
9643 }
9644 
9645 void VPReplicateRecipe::execute(VPTransformState &State) {
9646   Instruction *UI = getUnderlyingInstr();
9647   if (State.Instance) { // Generate a single instance.
9648     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9649     State.ILV->scalarizeInstruction(UI, this, *State.Instance,
9650                                     IsPredicated, State);
9651     // Insert scalar instance packing it into a vector.
9652     if (AlsoPack && State.VF.isVector()) {
9653       // If we're constructing lane 0, initialize to start from poison.
9654       if (State.Instance->Lane.isFirstLane()) {
9655         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9656         Value *Poison = PoisonValue::get(
9657             VectorType::get(UI->getType(), State.VF));
9658         State.set(this, Poison, State.Instance->Part);
9659       }
9660       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9661     }
9662     return;
9663   }
9664 
9665   if (IsUniform) {
9666     // If the recipe is uniform across all parts (instead of just per VF), only
9667     // generate a single instance.
9668     if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
9669         all_of(operands(), [](VPValue *Op) {
9670           return Op->isDefinedOutsideVectorRegions();
9671         })) {
9672       State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), IsPredicated,
9673                                       State);
9674       if (user_begin() != user_end()) {
9675         for (unsigned Part = 1; Part < State.UF; ++Part)
9676           State.set(this, State.get(this, VPIteration(0, 0)),
9677                     VPIteration(Part, 0));
9678       }
9679       return;
9680     }
9681 
    // Uniform within a part means we only need to generate lane 0 for each
    // unrolled copy.
9684     for (unsigned Part = 0; Part < State.UF; ++Part)
9685       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0),
9686                                       IsPredicated, State);
9687     return;
9688   }
9689 
  // A store of a loop-varying value to a loop-invariant address only needs the
  // last copy of the store.
9692   if (isa<StoreInst>(UI) && !getOperand(1)->hasDefiningRecipe()) {
9693     auto Lane = VPLane::getLastLaneForVF(State.VF);
    State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
                                    IsPredicated, State);
9696     return;
9697   }
9698 
9699   // Generate scalar instances for all VF lanes of all UF parts.
9700   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9701   const unsigned EndLane = State.VF.getKnownMinValue();
9702   for (unsigned Part = 0; Part < State.UF; ++Part)
9703     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9704       State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane),
9705                                       IsPredicated, State);
9706 }
9707 
9708 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9709   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9710 
9711   // Attempt to issue a wide load.
9712   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9713   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9714 
9715   assert((LI || SI) && "Invalid Load/Store instruction");
9716   assert((!SI || StoredValue) && "No stored value provided for widened store");
9717   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9718 
9719   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9720 
9721   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9722   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9723   bool CreateGatherScatter = !Consecutive;
9724 
9725   auto &Builder = State.Builder;
9726   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9727   bool isMaskRequired = getMask();
9728   if (isMaskRequired)
9729     for (unsigned Part = 0; Part < State.UF; ++Part)
9730       BlockInMaskParts[Part] = State.get(getMask(), Part);
9731 
9732   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9733     // Calculate the pointer for the specific unroll-part.
9734     GetElementPtrInst *PartPtr = nullptr;
9735 
9736     bool InBounds = false;
9737     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9738       InBounds = gep->isInBounds();
9739     if (Reverse) {
9740       // If the address is consecutive but reversed, then the
9741       // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width VF, VScale is 1, so RunTimeVF = VF.getKnownMinValue().
9744       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9745       // NumElt = -Part * RunTimeVF
9746       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9747       // LastLane = 1 - RunTimeVF
9748       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
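      // Worked example with assumed values: for a fixed VF of 4 (RunTimeVF is
      // 4) and Part == 1, NumElt is -4 and LastLane is -3, so the two GEPs
      // below produce Ptr - 7 and the wide access covers elements
      // Ptr[-7 .. -4], which are then reversed to match the scalar order.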
9749       PartPtr =
9750           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9751       PartPtr->setIsInBounds(InBounds);
9752       PartPtr = cast<GetElementPtrInst>(
9753           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9754       PartPtr->setIsInBounds(InBounds);
9755       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9756         BlockInMaskParts[Part] =
9757             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9758     } else {
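      // For consecutive forward accesses, part number Part simply starts
      // VF * Part elements past the base pointer (e.g., assuming a fixed VF of
      // 4, Part == 2 yields an increment of 8 elements).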
9759       Value *Increment =
9760           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9761       PartPtr = cast<GetElementPtrInst>(
9762           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9763       PartPtr->setIsInBounds(InBounds);
9764     }
9765 
9766     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9767     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9768   };
9769 
9770   // Handle Stores:
9771   if (SI) {
9772     State.setDebugLocFromInst(SI);
9773 
9774     for (unsigned Part = 0; Part < State.UF; ++Part) {
9775       Instruction *NewSI = nullptr;
9776       Value *StoredVal = State.get(StoredValue, Part);
9777       if (CreateGatherScatter) {
9778         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9779         Value *VectorGep = State.get(getAddr(), Part);
9780         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9781                                             MaskPart);
9782       } else {
9783         if (Reverse) {
9784           // If we store to reverse consecutive memory locations, then we need
9785           // to reverse the order of elements in the stored value.
9786           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9787           // We don't want to update the value in the map as it might be used in
9788           // another expression. So don't call resetVectorValue(StoredVal).
9789         }
9790         auto *VecPtr =
9791             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9792         if (isMaskRequired)
9793           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9794                                             BlockInMaskParts[Part]);
9795         else
9796           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9797       }
9798       State.addMetadata(NewSI, SI);
9799     }
9800     return;
9801   }
9802 
9803   // Handle loads.
9804   assert(LI && "Must have a load instruction");
9805   State.setDebugLocFromInst(LI);
9806   for (unsigned Part = 0; Part < State.UF; ++Part) {
9807     Value *NewLI;
9808     if (CreateGatherScatter) {
9809       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9810       Value *VectorGep = State.get(getAddr(), Part);
9811       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9812                                          nullptr, "wide.masked.gather");
9813       State.addMetadata(NewLI, LI);
9814     } else {
9815       auto *VecPtr =
9816           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9817       if (isMaskRequired)
9818         NewLI = Builder.CreateMaskedLoad(
9819             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9820             PoisonValue::get(DataTy), "wide.masked.load");
9821       else
9822         NewLI =
9823             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9824 
9825       // Add metadata to the load, but setVectorValue to the reverse shuffle.
9826       State.addMetadata(NewLI, LI);
9827       if (Reverse)
9828         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9829     }
9830 
9831     State.set(getVPSingleValue(), NewLI, Part);
9832   }
9833 }
9834 
9835 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9836 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9837 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9838 // for predication.
9839 static ScalarEpilogueLowering getScalarEpilogueLowering(
9840     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9841     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9842     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9843     LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9844   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9845   // don't look at hints or options, and don't request a scalar epilogue.
9846   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9847   // LoopAccessInfo (due to code dependency and not being able to reliably get
9848   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9849   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9850   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9851   // back to the old way and vectorize with versioning when forced. See D81345.)
9852   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9853                                                       PGSOQueryType::IRPass) &&
9854                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9855     return CM_ScalarEpilogueNotAllowedOptSize;
9856 
9857   // 2) If set, obey the directives
9858   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9859     switch (PreferPredicateOverEpilogue) {
9860     case PreferPredicateTy::ScalarEpilogue:
9861       return CM_ScalarEpilogueAllowed;
9862     case PreferPredicateTy::PredicateElseScalarEpilogue:
9863       return CM_ScalarEpilogueNotNeededUsePredicate;
9864     case PreferPredicateTy::PredicateOrDontVectorize:
9865       return CM_ScalarEpilogueNotAllowedUsePredicate;
9866     };
9867   }
9868 
9869   // 3) If set, obey the hints
9870   switch (Hints.getPredicate()) {
9871   case LoopVectorizeHints::FK_Enabled:
9872     return CM_ScalarEpilogueNotNeededUsePredicate;
9873   case LoopVectorizeHints::FK_Disabled:
9874     return CM_ScalarEpilogueAllowed;
9875   };
9876 
9877   // 4) if the TTI hook indicates this is profitable, request predication.
9878   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI))
9879     return CM_ScalarEpilogueNotNeededUsePredicate;
9880 
9881   return CM_ScalarEpilogueAllowed;
9882 }
9883 
9884 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
9885   // If Values have been set for this Def return the one relevant for \p Part.
9886   if (hasVectorValue(Def, Part))
9887     return Data.PerPartOutput[Def][Part];
9888 
9889   if (!hasScalarValue(Def, {Part, 0})) {
9890     Value *IRV = Def->getLiveInIRValue();
9891     Value *B = ILV->getBroadcastInstrs(IRV);
9892     set(Def, B, Part);
9893     return B;
9894   }
9895 
9896   Value *ScalarValue = get(Def, {Part, 0});
9897   // If we aren't vectorizing, we can just copy the scalar map values over
9898   // to the vector map.
9899   if (VF.isScalar()) {
9900     set(Def, ScalarValue, Part);
9901     return ScalarValue;
9902   }
9903 
9904   bool IsUniform = vputils::isUniformAfterVectorization(Def);
9905 
9906   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
9907   // Check if there is a scalar value for the selected lane.
9908   if (!hasScalarValue(Def, {Part, LastLane})) {
    // At the moment, VPWidenIntOrFpInductionRecipes and VPScalarIVStepsRecipes
    // can also be uniform.
9910     assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
9911             isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe())) &&
9912            "unexpected recipe found to be invariant");
9913     IsUniform = true;
9914     LastLane = 0;
9915   }
9916 
9917   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9918   // Set the insert point after the last scalarized instruction or after the
9919   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
9920   // will directly follow the scalar definitions.
9921   auto OldIP = Builder.saveIP();
9922   auto NewIP =
9923       isa<PHINode>(LastInst)
9924           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
9925           : std::next(BasicBlock::iterator(LastInst));
9926   Builder.SetInsertPoint(&*NewIP);
9927 
9928   // However, if we are vectorizing, we need to construct the vector values.
9929   // If the value is known to be uniform after vectorization, we can just
9930   // broadcast the scalar value corresponding to lane zero for each unroll
9931   // iteration. Otherwise, we construct the vector values using
9932   // insertelement instructions. Since the resulting vectors are stored in
9933   // State, we will only generate the insertelements once.
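  // For instance (illustrative only): a def that is uniform after
  // vectorization with lane-0 scalar %s becomes a broadcast splat
  // <%s, ..., %s>, while a non-uniform def with per-lane scalars %s0 ... %s3
  // is rebuilt with a chain of insertelement instructions.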
9934   Value *VectorValue = nullptr;
9935   if (IsUniform) {
9936     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
9937     set(Def, VectorValue, Part);
9938   } else {
    // Initialize packing with insertelements to start from poison.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Poison =
        PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Poison, Part);
9943     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
9944       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
9945     VectorValue = get(Def, Part);
9946   }
9947   Builder.restoreIP(OldIP);
9948   return VectorValue;
9949 }
9950 
9951 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
9953 // VPlan-to-VPlan transformations from the very beginning without modifying the
9954 // input LLVM IR.
9955 static bool processLoopInVPlanNativePath(
9956     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9957     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9958     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9959     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9960     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9961     LoopVectorizationRequirements &Requirements) {
9962 
9963   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9964     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9965     return false;
9966   }
9967   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9968   Function *F = L->getHeader()->getParent();
9969   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9970 
9971   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9972       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI);
9973 
9974   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9975                                 &Hints, IAI);
9976   // Use the planner for outer loop vectorization.
9977   // TODO: CM is not used at this point inside the planner. Turn CM into an
9978   // optional argument if we don't need it in the future.
9979   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
9980 
9981   // Get user vectorization factor.
9982   ElementCount UserVF = Hints.getWidth();
9983 
9984   CM.collectElementTypesForWidening();
9985 
9986   // Plan how to best vectorize, return the best VF and its cost.
9987   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9988 
9989   // If we are stress testing VPlan builds, do not attempt to generate vector
9990   // code. Masked vector code generation support will follow soon.
9991   // Also, do not attempt to vectorize if no vector code will be produced.
9992   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9993     return false;
9994 
9995   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9996 
9997   {
9998     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9999                              F->getParent()->getDataLayout());
10000     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10001                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
10002     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10003                       << L->getHeader()->getParent()->getName() << "\"\n");
10004     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
10005   }
10006 
10007   // Mark the loop as already vectorized to avoid vectorizing again.
10008   Hints.setAlreadyVectorized();
10009   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10010   return true;
10011 }
10012 
10013 // Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with double precision there
10015 // will be a performance penalty from the conversion overhead and the change in
10016 // the vector width.
10017 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10018   SmallVector<Instruction *, 4> Worklist;
10019   for (BasicBlock *BB : L->getBlocks()) {
10020     for (Instruction &Inst : *BB) {
10021       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10022         if (S->getValueOperand()->getType()->isFloatTy())
10023           Worklist.push_back(S);
10024       }
10025     }
10026   }
10027 
  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
10030   SmallPtrSet<const Instruction *, 4> Visited;
10031   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10032   while (!Worklist.empty()) {
10033     auto *I = Worklist.pop_back_val();
10034     if (!L->contains(I))
10035       continue;
10036     if (!Visited.insert(I).second)
10037       continue;
10038 
10039     // Emit a remark if the floating point store required a floating
10040     // point conversion.
10041     // TODO: More work could be done to identify the root cause such as a
10042     // constant or a function return type and point the user to it.
10043     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10044       ORE->emit([&]() {
10045         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10046                                           I->getDebugLoc(), L->getHeader())
10047                << "floating point conversion changes vector width. "
10048                << "Mixed floating point precision requires an up/down "
10049                << "cast that will negatively impact performance.";
10050       });
10051 
10052     for (Use &Op : I->operands())
10053       if (auto *OpI = dyn_cast<Instruction>(Op))
10054         Worklist.push_back(OpI);
10055   }
10056 }
10057 
10058 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10059                                        VectorizationFactor &VF,
10060                                        std::optional<unsigned> VScale, Loop *L,
10061                                        ScalarEvolution &SE) {
10062   InstructionCost CheckCost = Checks.getCost();
10063   if (!CheckCost.isValid())
10064     return false;
10065 
  // When interleaving only, the scalar and vector costs will be equal, which
  // in turn would lead to a divide by 0. Fall back to the hard threshold.
10068   if (VF.Width.isScalar()) {
10069     if (CheckCost > VectorizeMemoryCheckThreshold) {
10070       LLVM_DEBUG(
10071           dbgs()
10072           << "LV: Interleaving only is not profitable due to runtime checks\n");
10073       return false;
10074     }
10075     return true;
10076   }
10077 
  // The scalar cost should only be 0 when vectorizing with a user specified
  // VF/IC. In those cases, runtime checks should always be generated.
10079   double ScalarC = *VF.ScalarCost.getValue();
10080   if (ScalarC == 0)
10081     return true;
10082 
10083   // First, compute the minimum iteration count required so that the vector
10084   // loop outperforms the scalar loop.
10085   //  The total cost of the scalar loop is
10086   //   ScalarC * TC
10087   //  where
10088   //  * TC is the actual trip count of the loop.
10089   //  * ScalarC is the cost of a single scalar iteration.
10090   //
10091   //  The total cost of the vector loop is
10092   //    RtC + VecC * (TC / VF) + EpiC
10093   //  where
10094   //  * RtC is the cost of the generated runtime checks
10095   //  * VecC is the cost of a single vector iteration.
10096   //  * TC is the actual trip count of the loop
10097   //  * VF is the vectorization factor
  //  * EpiC is the cost of the generated epilogue, including the cost
10099   //    of the remaining scalar operations.
10100   //
10101   // Vectorization is profitable once the total vector cost is less than the
10102   // total scalar cost:
10103   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
10104   //
10105   // Now we can compute the minimum required trip count TC as
10106   //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
10107   //
10108   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
  // the computations are performed on doubles, not integers, and the result
10110   // is rounded up, hence we get an upper estimate of the TC.
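  // As an illustrative example with assumed costs: for ScalarC = 4, VecC = 12,
  // VF = 4 and RtC = 28, the vector loop is cheaper once 28 + 3 * TC < 4 * TC,
  // i.e. for TC > 28.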
10111   unsigned IntVF = VF.Width.getKnownMinValue();
10112   if (VF.Width.isScalable()) {
10113     unsigned AssumedMinimumVscale = 1;
10114     if (VScale)
10115       AssumedMinimumVscale = *VScale;
10116     IntVF *= AssumedMinimumVscale;
10117   }
10118   double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
10119   double RtC = *CheckCost.getValue();
10120   double MinTC1 = RtC / (ScalarC - VecCOverVF);
10121 
10122   // Second, compute a minimum iteration count so that the cost of the
10123   // runtime checks is only a fraction of the total scalar loop cost. This
10124   // adds a loop-dependent bound on the overhead incurred if the runtime
10125   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
10126   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
10127   // cost, compute
10128   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
10129   double MinTC2 = RtC * 10 / ScalarC;
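  // X is currently hard-coded to 10, i.e. the runtime checks may account for
  // at most roughly a tenth of the scalar loop cost. With the assumed example
  // costs above (RtC = 28, ScalarC = 4), MinTC2 = 28 * 10 / 4 = 70.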
10130 
10131   // Now pick the larger minimum. If it is not a multiple of VF, choose the
10132   // next closest multiple of VF. This should partly compensate for ignoring
10133   // the epilogue cost.
10134   uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
10135   VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
10136 
10137   LLVM_DEBUG(
10138       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10139              << VF.MinProfitableTripCount << "\n");
10140 
10141   // Skip vectorization if the expected trip count is less than the minimum
10142   // required trip count.
10143   if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
10144     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
10145                                 VF.MinProfitableTripCount)) {
10146       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
                           "trip count < minimum profitable trip count ("
10148                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
10149                         << ")\n");
10150 
10151       return false;
10152     }
10153   }
10154   return true;
10155 }
10156 
10157 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10158     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10159                                !EnableLoopInterleaving),
10160       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10161                               !EnableLoopVectorization) {}
10162 
10163 bool LoopVectorizePass::processLoop(Loop *L) {
10164   assert((EnableVPlanNativePath || L->isInnermost()) &&
10165          "VPlan-native path is not enabled. Only process inner loops.");
10166 
10167 #ifndef NDEBUG
10168   const std::string DebugLocStr = getDebugLocString(L);
10169 #endif /* NDEBUG */
10170 
10171   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10172                     << L->getHeader()->getParent()->getName() << "' from "
10173                     << DebugLocStr << "\n");
10174 
10175   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10176 
10177   LLVM_DEBUG(
10178       dbgs() << "LV: Loop hints:"
10179              << " force="
10180              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10181                      ? "disabled"
10182                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10183                             ? "enabled"
10184                             : "?"))
10185              << " width=" << Hints.getWidth()
10186              << " interleave=" << Hints.getInterleave() << "\n");
10187 
10188   // Function containing loop
10189   Function *F = L->getHeader()->getParent();
10190 
10191   // Looking at the diagnostic output is the only way to determine if a loop
10192   // was vectorized (other than looking at the IR or machine code), so it
10193   // is important to generate an optimization remark for each loop. Most of
10194   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10195   // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose, reporting vectorized loops and unvectorized loops that may
10197   // benefit from vectorization, respectively.
10198 
10199   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10200     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10201     return false;
10202   }
10203 
10204   PredicatedScalarEvolution PSE(*SE, *L);
10205 
10206   // Check if it is legal to vectorize the loop.
10207   LoopVectorizationRequirements Requirements;
10208   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10209                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10210   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10211     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10212     Hints.emitRemarkWithHints();
10213     return false;
10214   }
10215 
10216   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10217   // here. They may require CFG and instruction level transformations before
10218   // even evaluating whether vectorization is profitable. Since we cannot modify
10219   // the incoming IR, we need to build VPlan upfront in the vectorization
10220   // pipeline.
10221   if (!L->isInnermost())
10222     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10223                                         ORE, BFI, PSI, Hints, Requirements);
10224 
10225   assert(L->isInnermost() && "Inner loop expected.");
10226 
10227   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10228   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10229 
10230   // If an override option has been passed in for interleaved accesses, use it.
10231   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10232     UseInterleaved = EnableInterleavedMemAccesses;
10233 
10234   // Analyze interleaved memory accesses.
10235   if (UseInterleaved)
10236     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10237 
10238   // Check the function attributes and profiles to find out if this function
10239   // should be optimized for size.
10240   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10241       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI);
10242 
10243   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10244   // count by optimizing for size, to minimize overheads.
10245   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10246   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10247     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10248                       << "This loop is worth vectorizing only if no scalar "
10249                       << "iteration overheads are incurred.");
10250     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10251       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10252     else {
10253       if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10254         LLVM_DEBUG(dbgs() << "\n");
10255         SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10256       } else {
10257         LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10258                              "small to consider vectorizing.\n");
10259         reportVectorizationFailure(
            "The trip count is below the minimal threshold value.",
10261             "loop trip count is too low, avoiding vectorization",
10262             "LowTripCount", ORE, L);
10263         Hints.emitRemarkWithHints();
10264         return false;
10265       }
10266     }
10267   }
10268 
10269   // Check the function attributes to see if implicit floats or vectors are
10270   // allowed.
10271   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10272     reportVectorizationFailure(
10273         "Can't vectorize when the NoImplicitFloat attribute is used",
10274         "loop not vectorized due to NoImplicitFloat attribute",
10275         "NoImplicitFloat", ORE, L);
10276     Hints.emitRemarkWithHints();
10277     return false;
10278   }
10279 
10280   // Check if the target supports potentially unsafe FP vectorization.
10281   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10282   // for the target we're vectorizing for, to make sure none of the
10283   // additional fp-math flags can help.
10284   if (Hints.isPotentiallyUnsafe() &&
10285       TTI->isFPVectorizationPotentiallyUnsafe()) {
10286     reportVectorizationFailure(
10287         "Potentially unsafe FP op prevents vectorization",
10288         "loop not vectorized due to unsafe FP support.",
10289         "UnsafeFP", ORE, L);
10290     Hints.emitRemarkWithHints();
10291     return false;
10292   }
10293 
10294   bool AllowOrderedReductions;
10295   // If the flag is set, use that instead and override the TTI behaviour.
10296   if (ForceOrderedReductions.getNumOccurrences() > 0)
10297     AllowOrderedReductions = ForceOrderedReductions;
10298   else
10299     AllowOrderedReductions = TTI->enableOrderedReductions();
10300   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10301     ORE->emit([&]() {
10302       auto *ExactFPMathInst = Requirements.getExactFPInst();
10303       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10304                                                  ExactFPMathInst->getDebugLoc(),
10305                                                  ExactFPMathInst->getParent())
10306              << "loop not vectorized: cannot prove it is safe to reorder "
10307                 "floating-point operations";
10308     });
10309     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10310                          "reorder floating-point operations\n");
10311     Hints.emitRemarkWithHints();
10312     return false;
10313   }
10314 
10315   // Use the cost model.
10316   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10317                                 F, &Hints, IAI);
10318   CM.collectValuesToIgnore();
10319   CM.collectElementTypesForWidening();
10320 
10321   // Use the planner for vectorization.
10322   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
10323 
10324   // Get user vectorization factor and interleave count.
10325   ElementCount UserVF = Hints.getWidth();
10326   unsigned UserIC = Hints.getInterleave();
10327 
10328   // Plan how to best vectorize, return the best VF and its cost.
10329   std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10330 
10331   VectorizationFactor VF = VectorizationFactor::Disabled();
10332   unsigned IC = 1;
10333 
10334   GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
10335                            F->getParent()->getDataLayout());
10336   if (MaybeVF) {
10337     VF = *MaybeVF;
10338     // Select the interleave count.
10339     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10340 
10341     unsigned SelectedIC = std::max(IC, UserIC);
    // Optimistically generate runtime checks if they are needed. Drop them if
    // they turn out to not be profitable.
10344     if (VF.Width.isVector() || SelectedIC > 1)
10345       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10346 
10347     // Check if it is profitable to vectorize with runtime checks.
10348     bool ForceVectorization =
10349         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10350     if (!ForceVectorization &&
10351         !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
10352                                     *PSE.getSE())) {
10353       ORE->emit([&]() {
10354         return OptimizationRemarkAnalysisAliasing(
10355                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10356                    L->getHeader())
10357                << "loop not vectorized: cannot prove it is safe to reorder "
10358                   "memory operations";
10359       });
10360       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10361       Hints.emitRemarkWithHints();
10362       return false;
10363     }
10364   }
10365 
10366   // Identify the diagnostic messages that should be produced.
10367   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10368   bool VectorizeLoop = true, InterleaveLoop = true;
10369   if (VF.Width.isScalar()) {
10370     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10371     VecDiagMsg = std::make_pair(
10372         "VectorizationNotBeneficial",
10373         "the cost-model indicates that vectorization is not beneficial");
10374     VectorizeLoop = false;
10375   }
10376 
10377   if (!MaybeVF && UserIC > 1) {
10378     // Tell the user interleaving was avoided up-front, despite being explicitly
10379     // requested.
10380     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10381                          "interleaving should be avoided up front\n");
10382     IntDiagMsg = std::make_pair(
10383         "InterleavingAvoided",
10384         "Ignoring UserIC, because interleaving was avoided up front");
10385     InterleaveLoop = false;
10386   } else if (IC == 1 && UserIC <= 1) {
10387     // Tell the user interleaving is not beneficial.
10388     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10389     IntDiagMsg = std::make_pair(
10390         "InterleavingNotBeneficial",
10391         "the cost-model indicates that interleaving is not beneficial");
10392     InterleaveLoop = false;
10393     if (UserIC == 1) {
10394       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10395       IntDiagMsg.second +=
10396           " and is explicitly disabled or interleave count is set to 1";
10397     }
10398   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but is explicitly disabled.
10400     LLVM_DEBUG(
10401         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10402     IntDiagMsg = std::make_pair(
10403         "InterleavingBeneficialButDisabled",
10404         "the cost-model indicates that interleaving is beneficial "
10405         "but is explicitly disabled or interleave count is set to 1");
10406     InterleaveLoop = false;
10407   }
10408 
10409   // Override IC if user provided an interleave count.
10410   IC = UserIC > 0 ? UserIC : IC;
10411 
10412   // Emit diagnostic messages, if any.
10413   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10414   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
10416     ORE->emit([&]() {
10417       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10418                                       L->getStartLoc(), L->getHeader())
10419              << VecDiagMsg.second;
10420     });
10421     ORE->emit([&]() {
10422       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10423                                       L->getStartLoc(), L->getHeader())
10424              << IntDiagMsg.second;
10425     });
10426     return false;
10427   } else if (!VectorizeLoop && InterleaveLoop) {
10428     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10429     ORE->emit([&]() {
10430       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10431                                         L->getStartLoc(), L->getHeader())
10432              << VecDiagMsg.second;
10433     });
10434   } else if (VectorizeLoop && !InterleaveLoop) {
10435     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10436                       << ") in " << DebugLocStr << '\n');
10437     ORE->emit([&]() {
10438       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10439                                         L->getStartLoc(), L->getHeader())
10440              << IntDiagMsg.second;
10441     });
10442   } else if (VectorizeLoop && InterleaveLoop) {
10443     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10444                       << ") in " << DebugLocStr << '\n');
10445     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10446   }
10447 
10448   bool DisableRuntimeUnroll = false;
10449   MDNode *OrigLoopID = L->getLoopID();
10450   {
10451     using namespace ore;
10452     if (!VectorizeLoop) {
10453       assert(IC > 1 && "interleave count should not be 1 or 0");
10454       // If we decided that it is not legal to vectorize the loop, then
10455       // interleave it.
10456       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10457                                  &CM, BFI, PSI, Checks);
10458 
10459       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10460       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10461 
10462       ORE->emit([&]() {
10463         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10464                                   L->getHeader())
10465                << "interleaved loop (interleaved count: "
10466                << NV("InterleaveCount", IC) << ")";
10467       });
10468     } else {
10469       // If we decided that it is *legal* to vectorize the loop, then do it.
10470 
10471       // Consider vectorizing the epilogue too if it's profitable.
10472       VectorizationFactor EpilogueVF =
10473           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10474       if (EpilogueVF.Width.isVector()) {
10475 
10476         // The first pass vectorizes the main loop and creates a scalar epilogue
10477         // to be vectorized by executing the plan (potentially with a different
10478         // factor) again shortly afterwards.
10479         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10480         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10481                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10482 
10483         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10484         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10485                         DT, true);
10486         ++LoopsVectorized;
10487 
10488         // Second pass vectorizes the epilogue and adjusts the control flow
10489         // edges from the first pass.
10490         EPI.MainLoopVF = EPI.EpilogueVF;
10491         EPI.MainLoopUF = EPI.EpilogueUF;
10492         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10493                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10494                                                  Checks);
10495 
10496         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10497         VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10498         VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10499         Header->setName("vec.epilog.vector.body");
10500 
10501         // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10502         // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10503         // before vectorizing the epilogue loop.
10504         for (VPRecipeBase &R : Header->phis()) {
10505           if (isa<VPCanonicalIVPHIRecipe>(&R))
10506             continue;
10507 
10508           Value *ResumeV = nullptr;
10509           // TODO: Move setting of resume values to prepareToExecute.
10510           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10511             ResumeV = MainILV.getReductionResumeValue(
10512                 ReductionPhi->getRecurrenceDescriptor());
10513           } else {
10514             // Create induction resume values for both widened pointer and
10515             // integer/fp inductions and update the start value of the induction
10516             // recipes to use the resume value.
10517             PHINode *IndPhi = nullptr;
10518             const InductionDescriptor *ID;
10519             if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10520               IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
10521               ID = &Ind->getInductionDescriptor();
10522             } else {
10523               auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10524               IndPhi = WidenInd->getPHINode();
10525               ID = &WidenInd->getInductionDescriptor();
10526             }
10527 
10528             ResumeV = MainILV.createInductionResumeValue(
10529                 IndPhi, *ID, {EPI.MainLoopIterationCountCheck});
10530           }
10531           assert(ResumeV && "Must have a resume value");
10532           VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV);
10533           cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10534         }
10535 
10536         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10537                         DT, true);
10538         ++LoopsEpilogueVectorized;
10539 
10540         if (!MainILV.areSafetyChecksAdded())
10541           DisableRuntimeUnroll = true;
10542       } else {
10543         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10544                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10545                                PSI, Checks);
10546 
10547         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10548         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10549         ++LoopsVectorized;
10550 
10551         // Add metadata to disable runtime unrolling a scalar loop when there
10552         // are no runtime checks about strides and memory. A scalar loop that is
10553         // rarely used is not worth unrolling.
10554         if (!LB.areSafetyChecksAdded())
10555           DisableRuntimeUnroll = true;
10556       }
10557       // Report the vectorization decision.
10558       ORE->emit([&]() {
10559         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10560                                   L->getHeader())
10561                << "vectorized loop (vectorization width: "
10562                << NV("VectorizationFactor", VF.Width)
10563                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10564       });
10565     }
10566 
10567     if (ORE->allowExtraAnalysis(LV_NAME))
10568       checkMixedPrecision(L, ORE);
10569   }
10570 
10571   std::optional<MDNode *> RemainderLoopID =
10572       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10573                                       LLVMLoopVectorizeFollowupEpilogue});
10574   if (RemainderLoopID) {
10575     L->setLoopID(*RemainderLoopID);
10576   } else {
10577     if (DisableRuntimeUnroll)
10578       AddRuntimeUnrollDisableMetaData(L);
10579 
10580     // Mark the loop as already vectorized to avoid vectorizing again.
10581     Hints.setAlreadyVectorized();
10582   }
10583 
10584   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10585   return true;
10586 }
10587 
10588 LoopVectorizeResult LoopVectorizePass::runImpl(
10589     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10590     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10591     DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10592     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10593   SE = &SE_;
10594   LI = &LI_;
10595   TTI = &TTI_;
10596   DT = &DT_;
10597   BFI = &BFI_;
10598   TLI = TLI_;
10599   AC = &AC_;
10600   LAIs = &LAIs_;
10601   DB = &DB_;
10602   ORE = &ORE_;
10603   PSI = PSI_;
10604 
10605   // Don't attempt if
10606   // 1. the target claims to have no vector registers, and
10607   // 2. interleaving won't help ILP.
10608   //
10609   // The second condition is necessary because, even if the target has no
10610   // vector registers, loop vectorization may still enable scalar
10611   // interleaving.
10612   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10613       TTI->getMaxInterleaveFactor(1) < 2)
10614     return LoopVectorizeResult(false, false);
10615 
10616   bool Changed = false, CFGChanged = false;
10617 
10618   // The vectorizer requires loops to be in simplified form.
10619   // Since simplification may add new inner loops, it has to run before the
10620   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
10622   // vectorized.
10623   for (const auto &L : *LI)
10624     Changed |= CFGChanged |=
10625         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10626 
10627   // Build up a worklist of inner-loops to vectorize. This is necessary as
10628   // the act of vectorizing or partially unrolling a loop creates new loops
10629   // and can invalidate iterators across the loops.
10630   SmallVector<Loop *, 8> Worklist;
10631 
10632   for (Loop *L : *LI)
10633     collectSupportedLoops(*L, LI, ORE, Worklist);
10634 
10635   LoopsAnalyzed += Worklist.size();
10636 
10637   // Now walk the identified inner loops.
10638   while (!Worklist.empty()) {
10639     Loop *L = Worklist.pop_back_val();
10640 
10641     // For the inner loops we actually process, form LCSSA to simplify the
10642     // transform.
10643     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10644 
10645     Changed |= CFGChanged |= processLoop(L);
10646 
10647     if (Changed)
10648       LAIs->clear();
10649   }
10650 
10651   // Process each loop nest in the function.
10652   return LoopVectorizeResult(Changed, CFGChanged);
10653 }
10654 
10655 PreservedAnalyses LoopVectorizePass::run(Function &F,
10656                                          FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  // There are no loops in the function. Return before computing other
  // expensive analyses.
  if (LI.empty())
    return PreservedAnalyses::all();
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChanges is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
10701 }
10702 
10703 void LoopVectorizePass::printPipeline(
10704     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10705   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10706       OS, MapClassName2PassName);
10707 
10708   OS << "<";
10709   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10710   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10711   OS << ">";
10712 }
10713